<?php
/* Use, modification, and distribution are subject to the Boost Software 
License, Version 1.0. (See accompanying file LICENSE_1.0.txt or copy at 
www.boost.org/LICENSE_1.0.txt) */


// Initialize global variables
$version = 1;          // written into cached url lists and feedback rows
$hername = "basic";    // heuristic name stored next to $version
$resultsperpage = 10;
$database = "crawler3";
$debug = false;

/**
 * Pick the heuristic script to load.
 *
 * The value ends up in an include statement further down, so a request
 * parameter must never be used as-is: only names from the allow list are
 * accepted, anything else (missing, array, path, URL) yields the default.
 *
 * @param mixed  $requested  raw value of the 'hfile' request parameter, or null
 * @param array  $allowed    file names that may be included
 * @param string $default    file name used when $requested is not allowed
 * @return string            a member of $allowed, or $default
 */
function ogleSelectHeuristic($requested, $allowed, $default)
{
  if (is_string($requested) && in_array($requested, $allowed, true)) {
    return $requested;
  }
  return $default;
}

// The heuristics offered by the radio buttons of the search form
$allowed_heuristics = array(
  "simpleheuristic.php",
  "frequency.php",
  "freqrank.php",
  "tagheuristic.php",
  "tagphrase.php",
  "tagrank.php",
  "uberheuristic.php",
);
$heuristic = ogleSelectHeuristic(
  isset($_REQUEST['hfile']) ? $_REQUEST['hfile'] : null,
  $allowed_heuristics,
  "tagphrase.php"
);

/**
 * Current Unix time in seconds as a float, including the sub-second part.
 */
function getmicrotime()
{
  // microtime() returns "<fraction> <seconds>" as one space-separated string
  $pieces = explode( ' ',microtime() );
  $fraction = (float)$pieces[0];
  $seconds = (float)$pieces[1];
  return ( $seconds + $fraction );
}

/**
 * Scale a set of weights so that the largest one becomes 1.0.
 *
 * Keys are preserved and every value is returned as a float. When no weight
 * is greater than zero there is nothing to scale by, so the values are
 * returned unchanged (as floats) instead of dividing by zero.
 *
 * @param array $weight  key => numeric weight
 * @return array         key => weight divided by the largest weight
 */
function normalizeWeights($weight) {
  $high = 0;
  foreach($weight as $v) {
    if($v > $high) {
      $high = $v;
    }
  }
  foreach($weight as $k => $v) {
    if ($high > 0) {
      $weight[$k] = (float)$v / (float)$high;
    } else {
      $weight[$k] = (float)$v;
    }
  }
  return $weight;
}

/**
 * Echo a centered "Prev 1 2 3 ... Next" pager for a result list.
 *
 * Side effect: the requested page ($_REQUEST['pagenum']) is clamped to the
 * valid range and written back into $_REQUEST, because the result loop reads
 * it afterwards to decide which slice of the list to show.
 *
 * The links carry the current search (taken from $_REQUEST['query']), the
 * page number and the heuristic; every value is URL-encoded and the whole
 * link is HTML-escaped.
 *
 * @param array  $urllist         full result list (only its size is used)
 * @param string $heuristic       heuristic file name carried along in the links
 * @param int    $resultsperpage  number of results shown per page
 */
function printPageSelector($urllist, $heuristic, $resultsperpage) {
  $lastpage = (int)ceil(count($urllist) / $resultsperpage);

  // Clamp the requested page into 1..$lastpage (never below 1)
  $page = isset($_REQUEST['pagenum']) ? (int)$_REQUEST['pagenum'] : 1;
  if ($page > $lastpage) {
    $page = $lastpage;
  }
  if ($page < 1) {
    $page = 1;
  }
  $_REQUEST['pagenum'] = $page;

  // Undo backslash escaping of quotes/backslashes in the query (the same
  // convention the search box uses) before putting it into a link
  $rawquery = '';
  if (isset($_REQUEST['query']) && is_string($_REQUEST['query'])) {
    $rawquery = str_replace(array("\\'", "\\\"", "\\\\"), array("'", "\"", "\\"), $_REQUEST['query']);
  }
  $self = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '';
  // ISO-8859-1 makes htmlspecialchars byte-transparent: only & < > " ' change
  $linkhead = htmlspecialchars($self.'?query='.urlencode($rawquery).'&pagenum=', ENT_QUOTES, 'ISO-8859-1');
  $linktail = htmlspecialchars('&hfile='.urlencode($heuristic), ENT_QUOTES, 'ISO-8859-1');

  echo "<center>";

  if ($page > 1) {
    echo '<a href="'.$linkhead.($page-1).$linktail.'">&laquo;Prev</a> ';
  } else {
    echo '&laquo;Prev ';
  }

  // Show a window of at most 11 page numbers around the current page
  if ($page < 7) {
    $start = 1;
  } else if ($page > $lastpage - 5) {
    // near the end: show the last 11 pages, but never start before page 1
    $start = max(1, $lastpage - 10);
  } else {
    $start = $page - 5;
  }
  $end = min($start + 10, $lastpage);

  for ($j = $start; $j <= $end; $j++) {
    if ($page == $j) {
      echo " $j";
    } else {
      echo ' <a href="'.$linkhead.$j.$linktail.'"> '.$j.'</a>';
    }
  }

  if ($page < $lastpage) {
    echo ' <a href="'.$linkhead.($page+1).$linktail.'">Next&raquo;</a>';
  } else {
    echo ' Next&raquo;';
  }
  echo '</center>';
}

// Include the selected heuristic script. It is expected to define
// rankPages(), which this file calls later as
// rankPages($qlist, $escaped_qlist, $database, $debug) to build the url list.
// NOTE(review): $heuristic can originate from the 'hfile' request parameter,
// so it must only ever name a trusted local heuristic file when it gets here.
include $heuristic;

//Top of HTML Page
?>

<html>
<head>
<title>og/le</title>
<style type="text/css">
.listrow1 {
  background-color: #E9EEF8;
  background-image: url(images/1px_white.gif);
  background-repeat: repeat-x;
}
.listrow2 {
  background-color: #D9E2F4;
  background-image: url(images/1px_white.gif);
  background-repeat: repeat-x;
}
</style>
</head>

<body>

<table width="95%"><tr><td>&nbsp;&nbsp;&nbsp;&nbsp;</td><td>
<font size="+4"><b>og/le</b></font><br>
<font size="+1">optimal guesswork/luck-based engine</font></td>

<td align="right">
<font size="+3"> : carleton search</font></td></tr></table>
<br><br>
<center>
<a href="http://dictionary.reference.com/search?q=ogle">About</a>
&nbsp;&nbsp;&nbsp;
<a href="instructions.html">Instructions for Testers</a>
&nbsp;&nbsp;&nbsp;
<a href="stats.php">Statistics</a>
<p>Enter your query, select your heuristic, and ogle away!
  <form name="Ogle">
<?php 
//QUERY BOX

/**
 * Turn a raw 'query' request value into text that is safe inside a quoted
 * HTML attribute.
 *
 * Backslash-escaped quotes and backslashes (\' \" \\) are first reduced to
 * the plain character, then every HTML-special character, including both
 * kinds of quote, is replaced by its entity.
 *
 * @param string $raw  the query as received
 * @return string      attribute-safe text
 */
function escapeQueryForAttribute($raw)
{
  $plain = str_replace(array("\\'", "\\\"", "\\\\"), array("'", "\"", "\\"), $raw);
  // ISO-8859-1 makes htmlspecialchars byte-transparent: only & < > " ' change
  return htmlspecialchars($plain, ENT_QUOTES, 'ISO-8859-1');
}

echo '<input type="text" name="query" size="50" ';
if( !empty($_REQUEST['query']) && is_string($_REQUEST['query']) )
{
  $a = escapeQueryForAttribute($_REQUEST['query']);
  echo ' value=\''.$a.'\'';
}
echo " /><font size='-1'>";

//SELECT HEURISTIC
// Every search starts again at page 1
echo "<input type='hidden' name='pagenum' value='1'>
    ";

// One radio button per heuristic: markup printed before it, file name, label
$heuristic_choices = array(
  array('<br>', 'simpleheuristic.php', ' Word Occurrence  '),
  array('',     'frequency.php',       ' Word Frequency  '),
  array('',     'freqrank.php',        ' Word Frequency w/ PageRank '),
  array('<br>', 'tagheuristic.php',    ' Text Tags '),
  array('',     'tagphrase.php',       ' Text Tags w/ Phrasing  '),
  array('',     'tagrank.php',         ' Text Tags w/ Phrasing and PageRank  '),
  array('<br>', 'uberheuristic.php',   ' Tags, Frequency, and Rank w/ Phrasing'),
);
foreach ($heuristic_choices as $choice) {
  echo $choice[0]."<input type='radio' name='hfile' value='".$choice[1]."' ";
  // Pre-select the heuristic currently in use
  if ($heuristic === $choice[1]) echo 'CHECKED';
  echo ">".$choice[2];
}
echo "</font>";

//SUBMIT BUTTON
echo "<br><input type='submit' value='Ogle Carleton' />";


// Open the MySQL connection shared by all queries below; the database itself
// is chosen later with mysql_select_db() wherever it is needed.
// NOTE(review): the account name and password are hard-coded in this source
// file, and the connection error text is shown to the visitor.
$link = mysql_connect("localhost","webcrawler","twlv34-1");
if (!$link) {
  die("Couldn't connect to Violet:".mysql_error());
}

?>	
</form>
</center>
<?php 

if( !empty( $_REQUEST['query'] ) ) {

  //Parse the Query.  Produces a 2-D array of words -> qlist[]
  //  - every unquoted word becomes its own one-word entry: array(word)
  //  - every double-quoted phrase becomes one entry: array(word, word, ...)
  // $escaped_qlist mirrors $qlist with each word escaped for use in SQL.

  // Drop the backslash in front of escaped quotes (\' and \")
  $slashless = preg_replace("/\\\('|\")/i","\$1",$_REQUEST['query']);
  // Split on double quotes but keep each quote as its own element, so quoted
  // and unquoted stretches can be told apart below
  $pieces = preg_split("/\s*(\")\s*/", $slashless, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
  //Remove periods from query
  $pieces = str_replace('.', '', $pieces);

  $qlist = array();
  $inquotes = false;
  foreach ($pieces as $piece) {
    if ($piece == '"') {
      // a quote only toggles phrase mode, it is not a search term
      $inquotes = !$inquotes;
      continue;
    }
    // Everything except word characters, whitespace, quotes, backslash and
    // hyphen becomes a space. The hyphen is placed last in the class so it is
    // a literal and cannot form a range with its neighbours.
    $piece = strtolower(preg_replace('/[^\w\s"\'\\\\-]+/', ' ', $piece));
    // Split on any run of whitespace so no empty words are produced
    $words = preg_split('/\s+/', $piece, -1, PREG_SPLIT_NO_EMPTY);
    if (count($words) == 0) {
      continue;
    }
    if ($inquotes) {
      $qlist[] = $words;
    } else {
      foreach ($words as $word) {
        $qlist[] = array($word);
      }
    }
  }

  // Same shape as $qlist, each word escaped for SQL (uses the open connection)
  $escaped_qlist = array();
  foreach ($qlist as $j => $phrase) {
    foreach ($phrase as $k => $word) {
      $escaped_qlist[$j][$k] = mysql_real_escape_string($word);
    }
  }
  
  ////////////////////////////////////////////
  //Remove common words from the query

  // Very common English words, taken from the list at
  // http://esl.about.com/library/vocabulary/bl1000_list1.htm
  // (used as a set: only the keys matter)
  $common_words = array("the" => 1,
                        "of" => 1,
                        "to" => 1,
                        "and" => 1,
                        "a" => 1,
                        "in" => 1,
                        "is" => 1,
                        "it" => 1,
                        "you" => 1,
                        "that" => 1,
                        "he" => 1,
                        "for" => 1,
                        "on" => 1,
                        "are" => 1,
                        "with" => 1,
                        "as" => 1,
                        "i" => 1,
                        "his" => 1,
                        "they" => 1,
                        "be" => 1,
                        "at" => 1,
                        "one" => 1,
                        "have" => 1,
                        "this" => 1,
                        "from" => 1,
                        "or" => 1,
                        "had" => 1,
                        "by" => 1,
                        "hot" => 1);

  // An entry made of a single word is dropped when that word is common;
  // entries with several words (quoted phrases) are always kept.
  $removed_words = array();
  foreach ($qlist as $i => $phrase) {
    if (count($phrase) == 1 && array_key_exists($phrase[0], $common_words)) {
      $removed_words[] = $phrase[0];
      unset($qlist[$i]);
      // keep the SQL-escaped copy aligned with $qlist
      if (isset($escaped_qlist) && is_array($escaped_qlist)) {
        unset($escaped_qlist[$i]);
      }
    }
  }

  $removed_total = count($removed_words);
  if ($removed_total > 0) {
    // Close the gaps so both lists are numbered 0..n-1 again
    $qlist = array_values($qlist);
    if (isset($escaped_qlist) && is_array($escaped_qlist)) {
      $escaped_qlist = array_values($escaped_qlist);
    }

    // Tell the user which words were ignored: "a", "a and b", "a, b and c"
    $rem_print = '<p align="center"><font size="-1">';
    for ($i = 0; $i < $removed_total; $i++) {
      $rem_print .= '<b>'.$removed_words[$i].'</b>';
      if ($i == $removed_total - 2) {
        $rem_print .= ' and ';
      } else if ($i < $removed_total - 1) {
        $rem_print .= ', ';
      }
    }
    if ($removed_total == 1) {
      $rem_print .= ' is a very common word and was';
    } else {
      $rem_print .= ' are very common words and were';
    }
    $rem_print .= ' dropped from your search.</font></p>';
    echo $rem_print;
  }
    
  if($qlist) {
    ////////////////////////////////////////////
    //This code used by the feedback system!
    //
    // Finds the row of the `query` table whose term0, term1, ... columns match
    // the phrases of this search (inserting it when missing) -> $qid, and the
    // query_to_version row for the current $version/$hername -> $qvid.

    mysql_select_db("feedback") or die ("Couldn't select database 'feedback'");

    // One stored term per phrase: its words, each preceded by a single space.
    // The words come straight from the user's query, so they are escaped
    // before being placed inside the SQL string literals.
    $terms = array();
    foreach ($qlist as $phrase) {
      $terms[] = mysql_real_escape_string(' '.implode(' ', $phrase));
    }

    $conditions = array();
    foreach ($terms as $i => $term) {
      $conditions[] = 'q.term'.$i.' = "'.$term.'"';
    }
    // The first unused term column must be empty, otherwise a longer stored
    // query would match too.
    // NOTE(review): the limit of 10 suggests columns term0..term9; a search
    // with more than ten phrases is not handled here -- confirm with the schema.
    if (count($terms) < 10) {
      $conditions[] = 'q.term'.count($terms).' IS NULL';
    }

    $query = 'SELECT q.qid as qid, qv.qvid as qvid 
		FROM query q LEFT JOIN query_to_version qv 
		ON (q.qid = qv.qid AND version = '.$version.' AND htype = "'.$hername.'") 
		WHERE '.implode(' AND ', $conditions).';';

    $qvid = "";
    $result = mysql_query($query) or die("Query Failed:".mysql_error());
    if (mysql_num_rows($result) > 0) {
      // Known query; qvid is NULL when it has no row for this version yet
      $line = mysql_fetch_array($result,MYSQL_ASSOC);
      $qid = $line['qid'];
      $qvid = $line['qvid'];
      mysql_free_result($result);
    } else {
      mysql_free_result($result);

      // New query: store its terms and pick up the generated id
      $columns = array();
      $values = array();
      foreach ($terms as $i => $term) {
        $columns[] = 'term'.$i;
        $values[] = '"'.$term.'"';
      }
      $query = 'INSERT into query ('.implode(', ', $columns).') values('.implode(', ', $values).');';

      $result = mysql_query($query) or die( "Insert Failed:".mysql_error() );
      $result = mysql_query('SELECT LAST_INSERT_ID() as lid') or die ('QUERY FAILED:'.mysql_error() );
      $line = mysql_fetch_array($result,MYSQL_ASSOC);
      $qid = $line['lid'];
      mysql_free_result($result);
    }

    if ($qvid == "") {
      // No version row yet for this query: create it
      $query = 'INSERT into query_to_version (qid,version,htype) values('.$qid.', '.$version.', "'.$hername.'");';
      $result = mysql_query($query) or die("INsert Failed:".mysql_error() );
      $result = mysql_query('SELECT LAST_INSERT_ID() as lid') or die ('QUERY FAILED:'.mysql_error());
      $line = mysql_fetch_array($result, MYSQL_ASSOC);
      $qvid = $line['lid'];
      mysql_free_result($result);
    }

    //End Feedback System
    /////////////////////////////////////////
	
    /////////////////////////////////////////
    //Decide whether to use existing urllist, or find another
    //
    // Cache file layout, one value per line: heuristic name, version, then
    // alternating url id / score lines. The file is named after the query id.

    // Use one and the same path for the existence test, reading and writing
    $cachepath = "/var/www/html/ogle/urllists/$qid";
    $usecache = false;
    $time = 0;   // stays 0 when the list comes from the cache (nothing was ranked)

    $cachedlist = file_exists($cachepath) ? fopen($cachepath,"r") : false;
    if ($cachedlist) {
      $cachehname = rtrim(fgets($cachedlist));
      $cachever = rtrim(fgets($cachedlist));
      // Only trust a cache written by the same heuristic name and version
      if ($cachever == $version && $cachehname == $hername) {
        $usecache = true;
        $urllist = array();
        while(!feof($cachedlist)) {
          $uid = rtrim(fgets($cachedlist));
          $score = rtrim(fgets($cachedlist));
          // skip incomplete pairs (e.g. the empty read at end of file)
          if(strlen($uid) > 0 && strlen($score) > 0) {
            $urllist[] = array($uid, $score);
          }
        }
      }
      fclose($cachedlist);
    }

    if (!$usecache) {
      /////////////////////////////////////////
      //Include Search Heuristic Here

      $start_time = getmicrotime();
      $urllist = rankPages($qlist, $escaped_qlist, $database, $debug);
      $time = getmicrotime() - $start_time;

      //End Search Heuristics
      ////////////////////////////////////////

      // Remember a non-empty result for the next identical query
      if($urllist) {
        $newlist = fopen($cachepath,"w+");
        if ($newlist) {
          fwrite($newlist, "$hername\n");
          fwrite($newlist, "$version\n");
          foreach ($urllist as $val) {
            fwrite($newlist, "$val[0]\n");
            fwrite($newlist, "$val[1]\n");
          }
          fclose($newlist);
        }
      }

    }
  }
  
  // Result summary line: "<n> result(s) [<t> sec] using <heuristic> v<version>".
  // The search above is skipped when no search term is left, and the ranking
  // result may be empty, so make sure there is a countable list and a time.
  if (!isset($urllist) || !is_array($urllist)) {
    $urllist = array();
  }
  if (!isset($time)) {
    $time = 0;
  }
  $resultcount = count($urllist);
  echo '<h3>'.number_format($resultcount).' result';
  if($resultcount != 1) echo 's';
  echo '&nbsp;['.round( $time,2 ).' sec] using '.$hername.' v'.$version.'</h3>';

  if( count($urllist) > 0 ) {
    // Weight of each search entry by its position in the query: the first
    // one counts 1, every following one .125 less. The key is the first word
    // of the entry; a word that already has a weight keeps the earlier one.
    $qval = array();
    $phrasecount = count($qlist);
    for ($rank = 0; $rank < $phrasecount; $rank++) {
      $firstword = $qlist[$rank][0];
      if (array_key_exists($firstword, $qval)) {
        continue;
      }
      $qval[$firstword] = 1 - $rank * .125;
    }

    /////////////////////////////////////////////////////////////////////////////
    //
    //	    Top Page Selector
    printPageSelector($urllist, $heuristic, $resultsperpage);
    //
    //////////////////////////////////////////////////////////////////////////////


    /////////////////////////////////////////////////////////////////////////////
    //
    //        Print out List of Relevant Pages

    // Open the results table; the rows are produced from $database
    echo "\n\n<table>\n";

    if (!mysql_select_db( $database )) {
      die( "Couldn't select database $database" );
    }

    $listrow = 2;       // alternates between 1 and 2 to pick the row style
    $ident_count = 0;   // how many duplicates of the last printed result were seen
    $count = 0;         // full results printed on this page so far
    // Title and score of the previously handled result; a result repeating
    // both is treated as a duplicate of the one above it.
    $old_title = null;
    $old_score = null;

    // Parts that are the same for every result:
    //  - SQL conditions matching the first word of each search entry
    //  - the words to print in bold, quoted so they are literal in the regex
    $wordconds = array();
    $hlwords = array();
    foreach ($qlist as $val) {
      $wordconds[] = 'word.word="'.mysql_real_escape_string($val[0]).'"';
      foreach ($val as $val2) {
        if (strlen($val2) > 0) {
          $hlwords[] = preg_quote($val2, '/');
        }
      }
    }
    // A search word counts as a hit when it follows a non-word character
    $searchstring = '';
    if (count($hlwords) > 0) {
      $searchstring = '/([\W\s])('.implode('|', $hlwords).')/i';
    }

    // First result of the requested page ($_REQUEST['pagenum'] has already
    // been clamped by printPageSelector() above)
    $first = ((int)$_REQUEST['pagenum'] - 1) * $resultsperpage;
    if ($first < 0) {
      $first = 0;
    }
    $total = count($urllist);

    for ($i = $first; $count < $resultsperpage && $i < $total; $i++) {

      $urlid = (int)$urllist[$i][0];
      $new_score = (float)$urllist[$i][1];

      $query = 'SELECT u.url as url, t.header as header, t.text as text
			FROM url u, url_to_text t
			WHERE u.urlid = '.$urlid.' AND t.urlid=u.urlid;';
      $result = mysql_query($query) or die("Query Failed:".mysql_error());
      $line = mysql_fetch_array($result,MYSQL_ASSOC);
      mysql_free_result($result);
      if (!$line) {
        // no stored page for this url id: nothing to show
        continue;
      }

      $text = (string)$line['text'];
      // Page data comes from crawled documents, so it is HTML-escaped before
      // output. ISO-8859-1 keeps htmlspecialchars byte-transparent, and
      // existing entities are left alone (no double encoding).
      $safe_url = htmlspecialchars((string)$line['url'], ENT_QUOTES, 'ISO-8859-1', false);

      ///////////////////////////////////////
      // This code used by the feedback System

      // Clicks go through feedback.php, which receives rank, qvid and the url
      $title_print = '<a href="feedback.php?rank='.$i.'&amp;qvid='.$qvid.'&amp;url='.$safe_url.'">';

      // END FEEDBACK SYSTEM
      ///////////////////////////////////////

      // Fall back to the url when the page has no usable title
      $new_title = $line['header'];
      if(strlen((string)$new_title) < 3) {
        $new_title = $line['url'];
      }

      $title_print .= htmlspecialchars(str_replace("\\'","'",(string)$new_title), ENT_QUOTES, 'ISO-8859-1', false).'</a>';
      if(preg_match("/https?:\/\/[^\/]*carleton.*/i", (string)$line['url']) == 0) {
        $title_print .= '&nbsp;<img src="globe.jpg" alt="External Link">';
      }
      $title_print .= '&nbsp;&#40;'.(round($new_score, 2) * 100).'&#37;&#41;<br>';

      ///////////////////////////////////////
      //  Create Query to determine keyword dense section of page

      $query = 'SELECT straight_join wtu.pos as pos, word.word as word
			  FROM word_to_url wtu, word
			  WHERE wtu.wid=word.wid AND FIND_IN_SET("meta",wtu.tag)=0 AND wtu.urlid='.$urlid.'
			  AND ('.implode(' OR ', $wordconds).') ORDER by pos';

      //
      /////////////////////////////////////////

      $result = mysql_query($query) or die("Query Failed:".mysql_error());
      $parray = array();   // list of array(position, word), ordered by position
      while ($posresults = mysql_fetch_array($result, MYSQL_ASSOC)) {
        $parray[] = array($posresults['pos'], $posresults['word']);
      }
      mysql_free_result($result);

      if (count($parray) > 0) {
        // Find the occurrence that starts the heaviest window: the sum of the
        // weights of all occurrences less than 100 positions after it.
        $npos = count($parray);
        $maxsum = 0;
        $maxpos = $parray[0][0];
        $maxword = $parray[0][1];
        for ($j = 0; $j < $npos; $j++) {
          $currsum = 0;
          for ($k = $j; $k < $npos && $parray[$k][0] - $parray[$j][0] < 100; $k++) {
            $w = $parray[$k][1];
            if (isset($qval[$w])) {
              $currsum += $qval[$w];
            }
          }
          if ($currsum > $maxsum) {
            $maxsum = $currsum;
            $maxpos = $parray[$j][0];
            $maxword = $parray[$j][1];
          }
        }

        // Locate that word in the page text: first occurrence at or after
        // offset $maxpos, else the first occurrence anywhere, else the start.
        // NOTE(review): $maxpos comes from word_to_url.pos but is used here as
        // a character offset -- confirm what unit that column is in.
        $lowertext = strtolower($text);
        $needle = strtolower((string)$maxword);
        $pos = false;
        if ($needle !== '') {
          $offset = max(0, min((int)$maxpos, strlen($lowertext)));
          $pos = strpos($lowertext, $needle, $offset);
          if ($pos === false) {
            $pos = strpos($lowertext, $needle);
          }
        }
        if ($pos === false) {
          $pos = 0;
        }

        // Up to 500 characters starting a little before the hit; the start is
        // kept at 0 or above because a negative start would count from the end
        $snippet = (string)substr($text, max(0, $pos - 20), 500);
        $snippet = str_replace("\\'","'",$snippet);

        // Escape the snippet piece by piece and wrap every search word in <b>.
        // With two capture groups the split yields text, separator, word, ...
        $parts = ($searchstring != '') ? preg_split($searchstring, $snippet, -1, PREG_SPLIT_DELIM_CAPTURE) : false;
        if ($parts === false) {
          $new_text = htmlspecialchars($snippet, ENT_QUOTES, 'ISO-8859-1', false);
        } else {
          $new_text = '';
          foreach ($parts as $n => $part) {
            $part = htmlspecialchars($part, ENT_QUOTES, 'ISO-8859-1', false);
            $new_text .= ($n % 3 == 2) ? '<b>'.$part.'</b>' : $part;
          }
        }
        // Cut off the (probably partial) first and last word
        $new_text = preg_replace("/\A[\S]*\s(.*)\s[\S]*\Z/","\$1",$new_text);
      } else {
        // No word positions known: show text from the middle of the page, or
        // its first half when the page is short
        $half = (int)(strlen($text) / 2);
        if ($half < 500) {
          $new_text = (string)substr($text, 0, $half);
        } else {
          $new_text = (string)substr($text, $half, 500);
        }
        $new_text = htmlspecialchars($new_text, ENT_QUOTES, 'ISO-8859-1', false);
      }

      $is_repeat = ($old_title !== null
                    && $old_title == $new_title
                    && round($old_score, 2) == round($new_score, 2));

      if (!$is_repeat) {
        // A new result: full row with title, snippet and url
        $listrow = ($listrow % 2) + 1;
        $count++;
        $ident_count = 0;
        echo "\t".'<tr class="listrow'.$listrow.'">'."\n";
        echo "\t\t<td>";
        echo $title_print;
        echo '<font size="-1">';
        echo $new_text;
        echo '<p align=right><a href="'.$safe_url.'">'.$safe_url."</a></font><br>\n\t\t</td>"."\n";
        echo "\t".'</tr>'."\n";
      } else {
        // Same title and score as the result above: list the first duplicate
        // by url only, announce the suppression once, hide the rest. A row is
        // only closed when one was opened.
        $ident_count++;
        if($ident_count < 2) {
          echo "\t".'<tr class="listrow'.$listrow.'">'."\n";
          echo '<td align=right><font size="-1">';
          echo '<a href="'.$safe_url.'">'.$safe_url."</a></font><br>\n\t\t</td>"."\n";
          echo "\t".'</tr>'."\n";
        } else if($ident_count == 2) {
          echo "\t".'<tr class="listrow'.$listrow.'">'."\n";
          echo '<td align=center><font size="-1"><i>';
          echo "[Results similar to the pages above have been suppressed.]</i></font><br>\n\t\t</td>"."\n";
          echo "\t".'</tr>'."\n";
        }
      }

      $old_title = $new_title;
      $old_score = $new_score;

    }
    // Close the results table
    echo "\n\n", '</table>', "\n\n";

    // Bottom page selector: the same pager that is printed above the results
    printPageSelector($urllist, $heuristic, $resultsperpage);
  }
}

//////////////////////////////////////////////////////////////////////////
//
//         Print out pages indexed count
// Count the rows of url_to_text, close the connection and report the number.
if (!mysql_select_db( $database )) {
  die( "Couldn't select database $database" );
}
$countsql = "SELECT COUNT(*) AS number\n\t      FROM url_to_text u;";
$result = mysql_query( $countsql );
if (!$result) {
  die( "Query failed: ".mysql_error() );
}
$row = mysql_fetch_array( $result,MYSQL_ASSOC );
$pagecount = number_format($row['number']);
mysql_free_result($result);
mysql_close($link);
echo '<br><center><font size="-1"><p>ogling '.$pagecount.' pages</font></center></p>';
//
/////////////////////////////////////////////////////////////////////////////
?>
<p><center><font size="-1">
<img src="streaker.gif"><br>Powered by Streaker<br><br>
&copy; 2004 Josh Allen, Andrew Drummer, Brendan Foote, Aaron Miller, and Mike Ottum</font></center></p>
</body>
</html>
