/* Use, modification, and distribution are subject to the Boost Software 
License, Version 1.0. (See accompanying file LICENSE_1.0.txt or copy at 
www.boost.org/LICENSE_1.0.txt) */

<?php
//This php function implements a search heuristic that
//gives results a weight based on tags around search terms, 
//whether a page is in the carleton domain or not, and if it
//is within the carleton domain, how deep in the tree it is.

//It returns to search.php an array of array(urlid, weight) pairs, built from
//the page weights after sorting them in descending order and normalizing them.

//author: Mike Ottum

// Changelog:
//   v1.00 - tag heuristic and phrase matching, without looking at URLs
//   v1.5  - URLs are taken into account as well
// Identifier string for this heuristic.
// NOTE(review): presumably read by the script that includes this file - confirm.
$hername = 'tagphrase';

// Current revision number of this heuristic.
$version = 2.6;

/**
 * Ranks the pages that contain every term of a search query.
 *
 * Each term is a phrase (a list of consecutive words). A page's weight grows
 * with every occurrence of a phrase, with the markup tag recorded for the
 * first word of the occurrence (title, h1, b, u), with the phrase's last word
 * appearing in the URL, and with a one-time bonus for URLs whose host
 * contains "carleton" (reduced by 5 for every '/' beyond the first two).
 * Only pages in which all terms were found are returned.
 *
 * Relies on getmicrotime() and normalizeWeights(), which are defined outside
 * this function, and on an already opened ext/mysql connection.
 * NOTE(review): the mysql_* API was removed in PHP 7.0; porting needs the
 * connection handle, which is created outside this function.
 *
 * @param array  $qlist          List of terms; each term is a list of words
 *                               used to look up word ids and to match URLs.
 * @param array  $escaped_qlist  Same structure as $qlist; its words are
 *                               interpolated directly into SQL, so they must
 *                               already be escaped by the caller.
 * @param string $database       Name of the database to select.
 * @param mixed  $debug          When truthy, queries and timings are echoed
 *                               as HTML.
 *
 * @return array List of array(urlid, weight) pairs built from the weights
 *               after arsort() and normalizeWeights(); an empty array when
 *               the query is empty, a word is unknown, or nothing matches.
 */
function rankPages($qlist, $escaped_qlist, $database, $debug) {
  // Scoring constants of the heuristic.
  $title_bonus = 50;
  $header_bonus = 5;
  $bold_bonus = 3;
  $underline_bonus = 3;
  $carleton_bonus = 200;
  $url_bonus = 10;
  $search_query = $escaped_qlist;

  mysql_select_db( $database ) or die( "Couldn't select database $database" );

  // Count the terms (a phrase counts as a single term) and collect one SQL
  // condition per word.
  $num_terms = 0;
  $conditions = array();
  foreach( $search_query as $escaped_term )
    {
      $num_terms++;
      foreach( $escaped_term as $escaped_word )
        {
          $conditions[] = "word='".$escaped_word."'";
        }
    }

  // Nothing to look up: without this guard the WHERE clause would be
  // truncated into invalid SQL and the script would die.
  if(!$conditions) {
    return array();
  }

  // Term number k is recorded as the decimal digit 10^k; a page that contains
  // all terms therefore reaches 1 + 10 + 100 + ...
  $complete = 0;
  for($i=0; $i<$num_terms; $i++) {
    $complete += pow(10, $i);
  }

  // query the database for all the wid's in the query
  $query = 'SELECT word, wid FROM word WHERE '.implode(' OR ', $conditions).';';
  if($debug) echo $query.'<br>'."\n";
  $my_time = getmicrotime();
  $result = mysql_query( $query ) or die( "Query failed: ".mysql_error()."<br>".$query);
  $time = getmicrotime() - $my_time;
  if($debug) echo "Querying: ".round($time, 2).' sec<br>'."\n";
  $wid = array();
  while($line = mysql_fetch_array($result,MYSQL_ASSOC))
    {
      $wid[$line['word']] = $line['wid'];
    }

  // Per-page state, keyed by urlid.
  $in_doc = array();       // digit flags of the terms found in the page
  $in_title = array();     // digit flags of the terms found in the title
  $temp_weight = array();  // weight accumulated so far
  $bonus = array();        // whether the carleton bonus was already given
  $weight = array();       // final weights of the pages containing all terms

  $term_count = 0;
  foreach( $qlist as $S )
    {
      $num_words = count($S);

      // construct the phrase db query: one word_to_url alias per word
      $query = 'SELECT u.url AS url, w2u0.urlid AS urlid, w2u0.tag AS 0_tag';
      $query .= ' FROM url u, word_to_url w2u0';
      for($i=1; $i<$num_words; $i++)
        {
          $query .= ', word_to_url w2u'.$i;
        }
      $query .= ' WHERE ';
      $count = 0;
      // The URL bonus below is tested against the last word of the phrase.
      $url_word = '';
      foreach($S as $word)
        {
          // A word without an id cannot match any page, so no page can
          // contain every term.
          if(empty($wid[$word])) {
            return array();
          }
          $query .= 'w2u'.$count.'.wid='.$wid[$word].' AND ';
          $count++;
          $url_word = $word;
        }
      // Consecutive words must be adjacent and in the same page.
      for($i=0; $i<$num_words - 1; $i++)
        {
          $query .= 'w2u'.$i.'.pos=w2u'.($i+1).'.pos - 1 AND ';
          $query .= 'w2u'.$i.'.urlid=w2u'.($i+1).'.urlid AND ';
        }
      $query .= 'u.urlid=w2u0.urlid';

      if($debug) echo '<p>'.$query.'<br>'."\n";
      $my_time = getmicrotime();
      // run the query
      $result = mysql_query( $query ) or die( "Query failed: ".mysql_error()."<br>".$query);
      $time = getmicrotime() - $my_time;
      if($debug) echo "Querying: ".round($time, 2).' sec<br>'."\n";

      $my_time = getmicrotime();
      // Digit that represents the current term; constant for all rows.
      $term_flag = pow(10, $term_count);
      while($line = mysql_fetch_array($result,MYSQL_ASSOC))
        {
          $urlid = $line['urlid'];
          $url = (string) $line['url'];
          $tag = (string) $line['0_tag'];

          // First time this page is seen: start all of its counters at zero.
          if(!isset($temp_weight[$urlid])) {
            $in_doc[$urlid] = 0;
            $in_title[$urlid] = 0;
            $temp_weight[$urlid] = 0;
            $bonus[$urlid] = false;
          }

          // check off this term for this document
          if($in_doc[$urlid] < $term_flag) {
            $in_doc[$urlid] += $term_flag;
          }

          // does this document have all the search terms in it at this point?
          $complete_doc = ($in_doc[$urlid] == $complete);

          if(substr_count($tag, 'title') > 0) {
            // check off this term for this title
            if($in_title[$urlid] < $term_flag) {
              $in_title[$urlid] += $term_flag;
            }
            // extra bonus if all search terms are in the title
            if($in_title[$urlid] == $complete && $num_terms > 1) {
              $temp_weight[$urlid] += $title_bonus * 10;
            }
          }
          if(substr_count($tag, 'h1') > 0) {
            $temp_weight[$urlid] += $header_bonus;
          }
          // NOTE(review): these are substring tests, so any tag value that
          // merely contains the letter 'b' or 'u' also earns the bonus.
          if(substr_count($tag, 'b') > 0) {
            $temp_weight[$urlid] += $bold_bonus;
          }
          if(substr_count($tag, 'u') > 0) {
            $temp_weight[$urlid] += $underline_bonus;
          }
          // An empty needle is skipped: substr_count() rejects it.
          if($url_word !== '' && substr_count($url, $url_word) > 0) {
            $temp_weight[$urlid] += $url_bonus;
          }
          // One-time domain bonus, smaller for pages deeper in the tree.
          if(!$bonus[$urlid] && preg_match("/https?:\/\/[^\/]*carleton.*/i", $url) > 0) {
            $bonus[$urlid] = true;
            $temp_weight[$urlid] += $carleton_bonus - 5 * (substr_count($url, '/') - 2);
          }
          // every occurrence counts for one point
          $temp_weight[$urlid]++;

          // only documents containing all the search terms are published
          if($complete_doc) {
            $weight[$urlid] = $temp_weight[$urlid];
          }
        }
      $time = getmicrotime() - $my_time;
      if($debug) echo "Iterating: ".round($time, 2).' sec</p>'."\n";
      $term_count++;
    }

  if($debug) echo "<p>";
  $my_time = getmicrotime();
  // Always hand back an array, even when no page qualified.
  $urllist = array();
  if($weight)
    {
      arsort($weight);
      $weight = normalizeWeights($weight);
      foreach($weight as $k => $v)
        {
          $urllist[] = array($k, $v);
        }
    }
  $time = getmicrotime() - $my_time;
  if($debug) echo "Sorting: ".round($time, 2)." sec</p>\n";
  return $urllist;
}
?>
