/* Use, modification, and distribution are subject to the Boost Software 
License, Version 1.0. (See accompanying file LICENSE_1.0.txt or copy at 
www.boost.org/LICENSE_1.0.txt) */

// Crawler Class
// Main class to web-crawl
// Calls Parser.parse() on URLS stored in PagesToVisit Objects

import java.util.*;
import java.sql.*;
import java.text.DecimalFormat;
import java.io.*;
import java.net.URL;

/**
 * Ordered queue of URL strings that are waiting to be crawled.
 *
 * The queue is backed by a java.util.Vector, so every individual call is
 * synchronized.  Compound sequences such as isEmpty() followed by next()
 * are NOT atomic; callers sharing a queue between threads must make sure
 * only one thread removes elements.
 */
class PagesToVisit // {{{
{
    private Vector links = new Vector();
    
    /**
     * Removes and returns the link at the head of the queue.
     * Throws a runtime exception (from Vector.remove) when the queue is empty.
     */
    public String next()
    {
	return (String) links.remove( 0 );
    }

    /**
     * Returns the link at the head of the queue without removing it.
     * Throws a runtime exception (from Vector.firstElement) when the queue is empty.
     */
    public String peek()
    {
	return (String) links.firstElement();
    }
    
    /** Appends a link to the tail of the queue. */
    public void add( String link )
    {
	links.add( link );
    }

    /** Inserts a link at the head of the queue so it is returned by the next call to next(). */
    public void addAtStart( String link )
    {
	links.add(0, link);
    }
    
    /** Returns true when no links are queued. */
    public boolean isEmpty()
    {
	return links.isEmpty();
    }
    
    /** Returns the number of queued links. */
    public int size()
    {
	return links.size();
    }
    
    /** Returns an iterator over the queued links, head first. */
    public Iterator getIterator()
    {
	return links.iterator();
    }

    /**
     * Returns the link at position i, or null when i is outside the queue.
     * Negative indexes are treated like any other out-of-range index.
     */
    public String get(int i)
    {
	if(i < 0 || i >= links.size()) {
	    return null;
	}
	return (String) links.get(i);
    }

    /**
     * Removes and returns the link at position i, or returns null (removing
     * nothing) when i is outside the queue, including negative values.
     */
    public String remove(int i)
    {
	if(i < 0 || i >= links.size()) {
	    return null;
	}
	return (String) links.remove(i);
    }

} // }}}

/**
 * Simple holder that ties a fetched URL to the stream its content can be
 * read from and to the time the fetch took (the crawler stores elapsed
 * milliseconds here).
 */
class URLStream //{{{
{
    protected String url;
    protected InputStream stream;
    protected long fetchTime;

    /**
     * @param u the URL that was fetched
     * @param s stream positioned at the fetched content
     * @param t fetch duration recorded by the caller
     */
    public URLStream(String u, InputStream s, long t)
    {
	this.fetchTime = t;
	this.stream = s;
	this.url = u;
    }
} //}}}

/**
 * Web crawler and indexer.
 *
 * A background PageRetriever thread downloads pages named in the toDo queue
 * and hands the open streams to crawl(), which parses each page and queues
 * SQL inserts for its words and links.  Crawl state (queues, caches, seen
 * checksums) can be saved to and restored from the file named by filename.
 */
class Crawler // {{{
{
    // Variable Declaration for Crawler {{{
    
    // Debug switches: set one to true to print the corresponding kind of
    // message (fetches, fetch errors, checksum collisions, parse errors,
    // unknown errors, insert timings, index/follow decisions).
    private boolean debug_fetch = true;
    private boolean debug_fetcherror = true;
    private boolean debug_checksum = true;
    private boolean debug_parseerror = true;
    private boolean debug_unknown = true;
    private boolean debug_inserttimes = true;
    private boolean debug_indexing = true;
    
    // Running totals reported by printInfo() and ShowSystemState.
    private int indexedCount;     // pages processed whose checksum had not been seen before
    private int checkCount;       // pages skipped because their checksum was already seen
    private int errorCount;       // queue entries that arrived without a stream
    private int fetchErrorCount;  // URLs for which WebStream.get() returned null
    private int parseErrorCount;  // pages for which the parser produced no Page
    private int noFollowCount;    // links not queued because of Limiter or the page's follow flag
    private int SQLErrorCount;    // SQL exceptions caught in the crawl loop

    private double startTime;     // System.currentTimeMillis() at the start of crawl()
    private Connection conn;      // JDBC connection opened by crawl()
    private boolean quiet;        // set by the -q option; not read anywhere in this file

    // Cache statistics: the first three are maintained by WordCache, the
    // l-prefixed ones belong to LinkCache (ldbLookups is reset and saved
    // but never incremented in this file).
    private int cacheHits;
    private int cacheMisses;
    private int dbLookups;
    private int lcacheHits;
    private int lcacheMisses;
    private int ldbLookups;
    private boolean dbl;          // reset to false by LinkCache.getUID(); only printed with insert timings
    private WordCache wordCache;
    private LinkCache linkCache;
    private Quarantine quarantine;

    private PagesToVisit toDo;       // URLs waiting to be fetched
    private PagesToVisit toDoLater;  // URLs put aside because Limiter.fetch() refused their host
    private Statement stmt;          // never assigned in this file; closed at the end of crawl() if set
    private ResultSet rs;            // never assigned in this file; closed at the end of crawl() if set
    private String tempWord;         // word currently being inserted, echoed in SQL error reports

    private HashSet checkHash;       // Long checksums of pages already processed

    // Timing scratch values.  Only gU1 and gU2 are written (by
    // LinkCache.getUID()); none of them is read in this file.
    private long getUID = 0;
    private long gU1 = 0;
    private long gU2 = 0;
    private long gU3 = 0;
    private long gU4 = 0;
    private long gU5 = 0;
    
    // constants
    private static final String filename = "state.dat";                // crawl state file
    private static final String separator = "9999-9999?9999-9999";      // line that ends a section of the state file
    private static final long PARSETIMEOUT = 30000;                     // max milliseconds to wait for one page to parse
    
    //}}}
    
    // Batch Statements Stuff {{{
    // Statement objects created by prepareStmts().  The BatchStatement ones
    // collect SQL text via addBatch() and are flushed by commitStmts().
    private BatchStatement stmtWordInsert;
    private PreparedStatement stmtWIDQuery;      // looks a word up by its text
    private BatchStatement stmtURLInsert;
    private PreparedStatement stmtUIDQuery;      // declared but never prepared or used in this file
    private BatchStatement stmtWord_to_URL;
    private BatchStatement stmtLink;
    private BatchStatement stmtWord_to_Link;
    private PreparedStatement stmtUrl_to_Text;   // inserts a page's header and text
    
    /**
     * Builds the batch wrappers and prepared statements on the current
     * connection.  Does nothing when there is no connection; prints the
     * error and terminates the JVM with status 1 if any statement cannot
     * be created.
     */
    private void prepareStmts()
    {
	if(conn == null) {
	    return;
	}

	try {
	    // plain statements wrapped for batching
	    stmtWordInsert = new BatchStatement(conn.createStatement());
	    stmtWIDQuery = conn.prepareStatement("SELECT * FROM word WHERE word = ? ;");
	    stmtURLInsert = new BatchStatement(conn.createStatement());
	    stmtWord_to_URL = new BatchStatement(conn.createStatement());
	    stmtLink = new BatchStatement(conn.createStatement());
	    stmtWord_to_Link = new BatchStatement(conn.createStatement());
	    stmtUrl_to_Text = conn.prepareStatement("Insert into url_to_text (urlid,header,text) VALUES(?, ?, ?);");
	} catch (SQLException failure) {
	    System.err.println(failure);
	    System.exit(1);
	}
    }

    /**
     * Executes every pending batch, always in the order word, url,
     * word_to_url, link, word_to_link.  The first failure aborts the
     * sequence and is propagated to the caller.
     */
    private void commitStmts() throws SQLException
    {
	BatchStatement [] pending = {
	    stmtWordInsert,
	    stmtURLInsert,
	    stmtWord_to_URL,
	    stmtLink,
	    stmtWord_to_Link
	};
	for(int i = 0; i < pending.length; i++) {
	    pending[i].executeBatch();
	}
    }//}}}

    /**
     * JVM shutdown hook.  Asks the crawl to stop, waits for the retriever
     * thread and the crawl loop to finish, offers to save the crawl state
     * when work is still queued, and finally prints the summary.
     */
    class MyShutdown extends Thread //{{{
    {
	public void run()
	{
	    // Signal every loop that polls this flag to wind down.
	    crawling = false;
	    while(myRetriever != null && (myRetriever.isAlive() || !doneCrawling)) {
		try {
		    Thread.sleep(100);
		} catch (InterruptedException ie) {
		    // keep waiting: saving state before the workers stop would
		    // capture a queue that is still changing
		}
	    }

	    // The hook can run before crawl() has created the queues (for
	    // example when the JVM exits during start-up).  Nothing has been
	    // crawled in that case, so there is nothing to save or report.
	    if(toDo == null || toDoLater == null) {
		return;
	    }
	    
	    if(toDo.size() > 0 || toDoLater.size() > 0 || URLStreamList.size() > 0) {
		System.out.print("\nSave crawl state? (y/n): ");
		try {
		    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
		    String response = in.readLine();
		    // readLine() returns null when stdin is closed; treat that as "no"
		    if(response != null && response.toLowerCase().startsWith("y")) {
			writeState();
		    }
		} catch(IOException ioe) {
		    System.err.println("Error: " + ioe);
		}
	    }
	    printInfo();
	}
    } //}}}

    /**
     * Background reporter: while the crawl is running it prints a snapshot
     * of queue sizes, batch sizes and counters, then sleeps for ten minutes
     * (or until interrupted) before printing the next one.
     */
    class ShowSystemState extends Thread //{{{
    {
	public void run() {
	    while (crawling) {
		printSnapshot();
		try {
		    Thread.sleep(600000);
		} catch (InterruptedException ie) {
		    // woken early; the loop condition decides whether to go on
		}
	    }
	}

	// Prints one complete status report to standard output.
	private void printSnapshot() {
	    System.out.println("\n\n#####################################################");
	    System.out.println("Current System State Info");
	    System.out.println("#####################################################\n");
	    // minutes elapsed since crawl() started
	    System.out.println("\nState time:   "+((System.currentTimeMillis()-startTime)/60000)+"\n");

	    // work queues
	    System.out.println("Size of ToDo:          "+toDo.size());
	    System.out.println("Size of ToDoLater:     "+toDoLater.size());
	    System.out.println("Size of URLStreamList: "+URLStreamList.size()+"\n");

	    // statements waiting in each batch
	    System.out.println("WordInsert BatchSize:   "+stmtWordInsert.batchSize());
	    System.out.println("URLInsert BatchSize:    "+stmtURLInsert.batchSize());
	    System.out.println("Word_to_URL BatchSize:  "+stmtWord_to_URL.batchSize());
	    System.out.println("Link BatchSize:	        "+stmtLink.batchSize());
	    System.out.println("Word_to_Link BatchSize: "+stmtWord_to_Link.batchSize()+"\n");

	    // page and error counters
	    System.out.println("Pages Indexed:       "+indexedCount);
	    System.out.println("Checksum Collisions: "+checkCount);
	    System.out.println("Errors:		     "+errorCount);
	    System.out.println("SQL Errors:		     "+SQLErrorCount);
	    System.out.println("Fetch Errors:	     "+fetchErrorCount);
	    System.out.println("Parse Errors:	     "+parseErrorCount);
	    System.out.println("Links not followed:  "+noFollowCount+"\n");

	    // cache statistics
	    System.out.println("Unique Words:   "+cacheMisses);
	    System.out.println("Word Cache Hits:"+cacheHits);
	    System.out.println("Word DB Lookups:"+dbLookups+"\n");
	    System.out.println("Unique URLs:    "+lcacheMisses);
	    System.out.println("Link Cache Hits:"+lcacheHits+"\n");
	    System.out.println("#####################################################\n\n");
	}
    } //}}}

    private ShowSystemState mySystemState;
	    
    /**
     * Parses a single page on its own thread so the caller can give up
     * waiting after a timeout.  The result is left in the page field, which
     * stays null until Parser.parse() has returned.
     */
    class ParserThread extends Thread //{{{
    {
	private InputStream input;
	private String pageUrl;
	protected Page page;
	
	public ParserThread(String curUrl, InputStream stream) {
	    this.pageUrl = curUrl;
	    this.input = stream;
	    this.page = null;
	}
    
	public void run()
	{
	    Page parsed = Parser.parse(pageUrl, input);
	    page = parsed;
	}
    }//}}}

    // PageRetriever Stuff {{{
    // crawling:     true while the fetch and index loops should keep running.
    // doneCrawling: true once crawl() has flushed its final batches.
    // Both flags are shared between the crawl loop, the retriever, the state
    // printer and the shutdown hook, so they are volatile: without it a
    // thread is allowed to keep using a stale value indefinitely.
    private volatile boolean crawling;
    private volatile boolean doneCrawling;
    // Pages that have been fetched but not yet parsed (URLStream objects).
    private Vector URLStreamList = new Vector();

    /**
     * Retrieves pages in the background and adds them to the URLStreamList
     * vector.  URLs are taken from the head of toDo; URLs whose host the
     * Limiter refuses are parked on toDoLater and re-examined once at least
     * a minute has passed since Limiter.LAST_PAUSE_TIME.
     *
     * For hosts under carleton.edu the thread keeps a smoothed fetch time
     * per host and, when the next URL is on the same host, sleeps so that
     * consecutive requests are at least twice that time apart.
     */
    class PageRetriever extends Thread
    {
	private int MAX_STREAMLIST_SIZE = 500;

	public void run()
	{
	    long t1, time;
	    InputStream stream;
	    String url;
	    HashMap hosts = new HashMap();   // host -> Long smoothed fetch time (ms)
	    long delay_ms;
	    String prevhost = "";
	    String host = "";
	    
	    while(crawling) {
		
		// Idle while there is nothing to fetch or the hand-off list is full.
		while(((toDo.isEmpty() && toDoLater.isEmpty()) || 
		       URLStreamList.size() >= MAX_STREAMLIST_SIZE) && crawling) {
		    try {
			Thread.sleep(100);
		    } catch(InterruptedException ie) {}

		}
		// Stop promptly once a shutdown has been requested instead of
		// starting one more download.
		if(!crawling) {
		    break;
		}

		if(!toDoLater.isEmpty() && 
		   System.currentTimeMillis() - Limiter.LAST_PAUSE_TIME >= 60000) {
		    // Move every entry the Limiter now accepts back to the head
		    // of toDo.  The index only advances when nothing was removed;
		    // otherwise the element that slides into slot i would be
		    // skipped.
		    // NOTE(review): Limiter.fetch() is given a full URL here but
		    // a host name further down - confirm which one it expects.
		    int i = 0;
		    while(i < toDoLater.size()) {
			if(Limiter.fetch(toDoLater.get(i))) {
			    toDo.addAtStart(toDoLater.remove(i));
			} else {
			    i++;
			}
		    }
		}
		try {
		    Thread.sleep(100);
		} catch(InterruptedException ie) {}

		if(toDo.isEmpty()) {
		    continue;
		}
		url = toDo.peek();
		if(url == null) {
		    // a null entry cannot be fetched; discard it
		    toDo.next();
		    continue;
		}

		try {
		    URL myUrl = new URL(url);
		    host = myUrl.getHost();
		} catch(Exception mue) {
		    System.err.println("err: " + url);
		    System.err.println(mue);
		    // Without a host the URL can be neither rate-limited nor
		    // fetched.  Count it as a fetch error and drop it rather than
		    // carrying on with the host of the previous URL.
		    quarantine.kill(url);
		    fetchErrorCount++;
		    toDo.next();
		    continue;
		}

		if(!Limiter.fetch(host)) {
		    // host is currently off limits: park the URL for later
		    toDoLater.add(url);
		    toDo.next();
		    continue;
		}

		t1 = System.currentTimeMillis();
		stream = WebStream.get(url);
		time = System.currentTimeMillis() - t1;

		// Update the per-host fetch-time estimate used for throttling.
		if(!hosts.containsKey(host)) {
		    if (host.matches(".*\\.carleton\\.edu.*")) {
			hosts.put(host, new Long(time));
			delay_ms = time;
		    } else {
			delay_ms = 0;
		    }
		}
		else {
		    if(prevhost.equals(host)) {
			delay_ms = ((Long) hosts.get(host)).longValue();
			if(delay_ms != time) {
			    // move the estimate halfway towards the latest sample
			    delay_ms = delay_ms + (long)((time - delay_ms) * .5);
			    hosts.put(host, new Long(delay_ms));
			}
		    } else {
			delay_ms = 0;
		    }
		}
		prevhost = host;

		if (stream != null) {
		    if (debug_fetch)
			System.out.println("Fetch: " + url + "  " + delay_ms*2);
		    URLStreamList.add(new URLStream(url, stream, time));
		} else {
		    quarantine.kill(url);
		    if (debug_fetcherror)
			System.out.println("Fetch: Bad URL:" + url + " " + delay_ms*2);
		    fetchErrorCount++;
		}

		// The URL stays on toDo until its stream has been handed over, so
		// the crawl loop never sees every queue empty in the middle of a
		// fetch.
		String nextHost = "";
		toDo.next();
		try {
		    if(!toDo.isEmpty()) {
			URL myUrl = new URL(toDo.peek());
			nextHost = myUrl.getHost();
		    }
		} catch(Exception mue) {
		    System.err.println(mue);
		}
		// Back off before hitting the same carleton host again.
		if(nextHost.equals(host) && time < delay_ms * 2 && host.matches(".*carleton.*")) {
		    try {
			Thread.sleep(delay_ms * 2 - time);
		    } catch(InterruptedException ie) {}
		}
	    }
	    System.out.println("Fetcher Done");
	}
    }

    private PageRetriever myRetriever;
    //}}}

    //WriteState Stuff {{{
    private static String line_t = "[\n\r\u0085\u2028\u2029]";
    
    /**
     * Saves the crawl state to the file named by filename so that a later
     * run started with --resume can continue from it.
     *
     * File layout, one value per line:
     *   1. eleven counters: indexedCount, checkCount, parseErrorCount,
     *      fetchErrorCount, errorCount, cacheHits, cacheMisses, dbLookups,
     *      lcacheHits, lcacheMisses, ldbLookups
     *   2. the URLs still to visit (fetched-but-unparsed pages first, then
     *      toDoLater, then toDo), ended by a separator line
     *   3. wordCache: the next word id, then tab-separated word/id pairs,
     *      ended by a separator line
     *   4. linkCache: the next url id, then tab-separated url/id pairs,
     *      ended by a separator line
     *   5. the page checksums seen so far, up to the end of the file
     *
     * noFollowCount and SQLErrorCount are not saved.  Failures are reported
     * on standard error; the method never throws IOException.
     */
    private void writeState()
    {
	PrintWriter out = null;
	try {
	    
	    System.out.println("Writing state...");
	    out = new PrintWriter(new FileWriter(filename));
	
	    System.out.println("\tGlobal variables...");
	    // global variables (readState() must read them in this order)
	    out.println(indexedCount);
	    out.println(checkCount);
	    out.println(parseErrorCount);
	    out.println(fetchErrorCount);
	    out.println(errorCount);
	    out.println(cacheHits);
	    out.println(cacheMisses);
	    out.println(dbLookups);
	    out.println(lcacheHits);
	    out.println(lcacheMisses);
	    out.println(ldbLookups);

	    System.out.println("\tTo-Visit queue...");
	    // to-visit queue
	    Iterator it = URLStreamList.iterator();
	    while(it.hasNext()) {
		// first put the un-parsed streams back on the to-do list
		URLStream u = (URLStream)it.next();
		out.println(u.url);
	    }
	    it = toDoLater.getIterator();
	    while(it.hasNext()) {
		// now write the to-do later list
		out.println((String)it.next());
	    }
	    it = toDo.getIterator();
	    while(it.hasNext()) {
		// now write the actual to-do list
		out.println((String)it.next());
	    }
	    out.println(separator);

	    System.out.println("\tWordCache...");
	    // wordCache
	    it = wordCache.getIterator();
	    out.println(wordCache.curWID);
	    while(it.hasNext()) {
		String key = (String)it.next();
		Integer value = (Integer)wordCache.wordHash.get(key);
		out.println(key + "\t" + value);
	    }
	    out.println(separator);
	    
	    System.out.println("\tLinkCache...");
	    // linkCache
	    it = linkCache.getIterator();
	    out.println(linkCache.curUID);
	    while(it.hasNext()) {
		String key = (String)it.next();
		Integer value = (Integer)linkCache.linkHash.get(key);
		out.println(key + "\t" + value);
	    }
	    out.println(separator);
	    
	    System.out.println("\tCheckHash...");
	    // checkHash
	    it = checkHash.iterator();
	    while(it.hasNext()) {
		out.println((Long)it.next());
	    }

	    // PrintWriter never throws on I/O failure; it only records it.
	    // checkError() flushes and reports whether anything went wrong, so
	    // a truncated state file is not mistaken for a good one.
	    if(out.checkError()) {
		throw new IOException("write failed (disk full or file not writable?)");
	    }
	} catch(IOException ioe) {
	    System.err.println("Cannot write state to file " + filename + ":\n" + ioe);
	} finally {
	    // release the file handle on every path, including runtime failures
	    if(out != null) {
		out.close();
	    }
	}
    } //}}}

    /**
     * Restores the crawl state written by writeState(): the counters, the
     * to-visit queue (everything goes onto toDo), both caches and the set
     * of seen checksums.
     *
     * A missing, truncated or malformed state file is reported on standard
     * error and terminates the JVM with status 1.  An SQLException raised
     * while refilling the word cache is printed and otherwise ignored, as
     * before.
     */
    private void readState() //{{{
    {
	BufferedReader in = null;
	try {

	    System.out.println("Reading state...");
	    in = new BufferedReader(new FileReader(filename));
	    String line;

	    System.out.println("\tGlobal variables...");
	    // global variables, in exactly the order writeState() emits them
	    indexedCount = readStateInt(in);
	    checkCount = readStateInt(in);
	    parseErrorCount = readStateInt(in);
	    fetchErrorCount = readStateInt(in);
	    errorCount = readStateInt(in);
	    cacheHits = readStateInt(in);
	    cacheMisses = readStateInt(in);
	    dbLookups = readStateInt(in);
	    lcacheHits = readStateInt(in);
	    lcacheMisses = readStateInt(in);
	    ldbLookups = readStateInt(in);
	    
	    System.out.println("\tTo-Visit queue...");
	    // to-visit queue
	    line = readStateLine(in);
	    while(!line.equals(separator)) {
		toDo.add(line);
		line = readStateLine(in);
	    }
	    
	    System.out.println("\tWordCache...");
	    // wordCache
	    wordCache.setCurWID(readStateInt(in));
	    line = readStateLine(in);
	    while(!line.equals(separator)) {
		String [] s = line.split("\t");
		// skip lines without a value, the same way the link cache does
		if (s.length > 1) {
		    wordCache.put(s[0], Integer.parseInt(s[1]));
		}
		line = readStateLine(in);
	    }
	    
	    System.out.println("\tLinkCache...");
	    // linkCache
	    linkCache.setCurUID(readStateInt(in));
	    line = readStateLine(in);
	    while(!line.equals(separator)) {
		String [] s = line.split("\t");
		if (s.length > 1) {
		    linkCache.put(s[0], Integer.parseInt(s[1]));
		}
		line = readStateLine(in);
	    }

	    System.out.println("\tCheckHash...");
	    // checkHash: the last section simply runs to the end of the file
	    line = in.readLine();
	    while(line != null && !line.equals(separator)) {
		checkHash.add(new Long(line));
		line = in.readLine();
	    }
	    
	} catch (NumberFormatException nfe) {
	    System.err.println("Error reading data from " + filename + " :");
	    System.err.println("malformed number: " + nfe.getMessage());
	    System.exit(1);
	} catch (IOException ioe) {
	    System.err.println("Error reading data from " + filename + " :");
	    System.err.println(ioe);
	    System.exit(1);
	}
	catch (SQLException sql) {
	    System.err.println(sql);
	}
	finally {
	    if(in != null) {
		try {
		    in.close();
		} catch (IOException ignored) {
		    // nothing useful can be done about a failed close on a reader
		}
	    }
	}
    } //}}}

    // Returns the next line of the state file, or throws EOFException when
    // the file ends where more data was required.
    private String readStateLine(BufferedReader in) throws IOException
    {
	String line = in.readLine();
	if(line == null) {
	    throw new EOFException("unexpected end of file in " + filename);
	}
	return line;
    }

    // Reads the next line of the state file and parses it as an int.
    private int readStateInt(BufferedReader in) throws IOException
    {
	return Integer.parseInt(readStateLine(in).trim());
    }

    /**
     * Holding area for links whose target page has not been fetched yet.
     * Links are grouped by target URL.  When the target is parsed
     * successfully release() writes the held links to the database; when
     * the target turns out to be unusable kill() discards them.
     *
     * Every method is synchronized because kill() is called from the
     * retriever thread while the crawl thread calls the others, and the
     * underlying HashMap is not safe for concurrent modification.
     */
    class Quarantine //{{{
    {
	protected HashMap qHash;   // target URL -> Vector of Link
    
	public Quarantine()
	{
	    qHash = new HashMap(100, (float) 1.00);
	}
    
	/** Returns true when at least one link to the given URL is being held. */
	public synchronized boolean isIn(String url)
	{
	    return qHash.containsKey(url);
	}
    
	/** Holds a link under its target URL. */
	public synchronized void add(Link newLink) 
	{
	    String target = newLink.getURL();
	    Vector links = (Vector) qHash.get(target);
	    if (links == null) {
		links = new Vector();
		qHash.put(target, links);
	    }
	    links.add(newLink);
	}

	/**
	 * Queues database inserts for every link held for the given URL and
	 * then forgets them.  If an insert fails the links stay quarantined.
	 */
	public synchronized void release(String url) throws SQLException
	{
	    Vector links = (Vector) qHash.get(url);
	    if (links == null) {
		return;
	    }
	    for (Iterator it = links.iterator(); it.hasNext(); ) {
		linkToDB((Link) it.next());
	    }
	    qHash.remove(url);
	}

	/** Discards every link held for the given URL; no-op when none are held. */
	public synchronized void kill(String url)
	{
	    qHash.remove(url);
	}
    } //}}}

    /**
     * Maps words to their numeric ids (wid), allocating a new id and
     * queueing an INSERT for words that have not been seen before.
     *
     * Up to MAX_HASH_SIZE words are kept in memory.  While the map is not
     * full it holds every known word, so a miss means the word is new.
     * Once it is full, misses are resolved with a database lookup.
     *
     * NOTE(review): replace() is a stub, so after the map fills up no
     * further words are cached.  Unless BatchStatement flushes on its own,
     * a word inserted in that phase is invisible to the lookup until the
     * batch is executed and may be given a second id.
     * NOTE(review): words are concatenated into the INSERT text unescaped;
     * presumably the parser only produces quote-free tokens - confirm.
     */
    class WordCache //{{{
    {
	private static final int MAX_HASH_SIZE = 250000;
	private boolean max_size_reached;
	private int size;
	protected HashMap wordHash;   // word -> Integer wid
	protected int curWID;         // next id to hand out
	private String mru;           // most recently requested word
	
	public WordCache()
	{
	    wordHash = new HashMap(MAX_HASH_SIZE, (float) 1.00);
	    max_size_reached = false;
	    size = 0;
	    curWID = 1;
	    cacheHits = 0;
	    cacheMisses = 0;
	    dbLookups = 0;
	}

	/** Sets the next word id to be allocated (used when resuming a crawl). */
	public void setCurWID(int cur)
	{
	    curWID = cur;
	}

	/** Returns an iterator over the cached words. */
	public Iterator getIterator()
	{
	    return wordHash.keySet().iterator();
	}

	// Placeholder for a replacement policy; currently does nothing, so a
	// full cache stays exactly as it is.
	private void replace(String key, int value)
	{
	    // most recently used replacement algorithm
	    // wordHash.remove(mru);
	    // 	    wordHash.put(key, new Integer(value));
	}

	/**
	 * Adds a word/id pair to the cache if there is still room.  When the
	 * cache becomes full the pending word inserts are flushed so that
	 * later database lookups can see them.
	 */
	public void put(String key, int value) throws SQLException
	{
	    if(max_size_reached) {
		replace(key, value);
		return;
	    }
	    wordHash.put(key, new Integer(value));
	    size++;
	    if(size >= MAX_HASH_SIZE) {
		max_size_reached = true;
		stmtWordInsert.executeBatch();
	    }
	}

	/**
	 * Returns the id of the given word, allocating one (and queueing the
	 * INSERT) when the word is new.
	 */
	public int getWID(String key) throws SQLException //{{{
	{
	    int ret;
	    Integer cached = (Integer) wordHash.get(key);
	    if(cached != null) {
		ret = cached.intValue();
		cacheHits++;
	    }
	    else if(!max_size_reached) {
		// the word is not in the hash and the hash is not full
		// therefore, the word is not in the database, so add it
		curWID++;
		ret = curWID - 1;
		stmtWordInsert.addBatch("INSERT INTO word (word,wid) VALUES('" + key + "'," + ret + ");" );
		put(key, ret);
		cacheMisses++;
	    }
	    else {
		// the hash is full, so we need to check the database
		ret = lookupOrInsert(key);
	    }
	    mru = key;
	    return ret;
	} //}}}

	// Resolves a word through the database: returns its stored id, or
	// allocates the next id and queues the INSERT when no row exists.
	private int lookupOrInsert(String key) throws SQLException
	{
	    // The driver quotes the parameter itself; wrapping the word in
	    // quote characters would search for the quotes as part of the word.
	    stmtWIDQuery.setString(1, key);
	    ResultSet rs = stmtWIDQuery.executeQuery();
	    dbLookups++;
	    try {
		// next() works on forward-only result sets, unlike last()
		if(rs.next()) {
		    // read by name: "SELECT *" gives no guarantee about column order
		    int wid = rs.getInt("wid");
		    put(key, wid);
		    return wid;
		}
		// the word is not in the database, so insert it
		stmtWordInsert.addBatch("INSERT INTO word (word,wid) VALUES('"+key+"'," + curWID + ");" );
		put(key, curWID);
		curWID++;
		return curWID - 1;
	    } finally {
		rs.close();
	    }
	}

    } //}}}

    /**
     * Maps URLs to their numeric ids.  Unlike the word cache this map is
     * never treated as full: a URL that is not in the map is always given
     * the next id and an INSERT is queued for it.
     *
     * NOTE(review): URLs are concatenated into the INSERT text unescaped;
     * presumably they are sanitised before they get here - confirm.
     */
    private class LinkCache //{{{
    {
	private int size;
	protected HashMap linkHash;   // url -> Integer urlid
	protected int curUID;         // next id to hand out
	
	public LinkCache()
	{
	    lcacheHits = 0;
	    lcacheMisses = 0;
	    ldbLookups = 0;
	    size = 0;
	    curUID = 1;
	    linkHash = new HashMap(160000, 1.0f);
	}

	/** Sets the next url id to be allocated (used when resuming a crawl). */
	public void setCurUID(int cur)
	{
	    this.curUID = cur;
	}
    
	/** Returns an iterator over the cached URLs. */
	public Iterator getIterator()
	{
	    Set urls = linkHash.keySet();
	    return urls.iterator();
	}

	// Stores a url/id pair and bumps the entry counter.
	private void put(String key, int value) 
	{
	    size++;
	    linkHash.put(key, new Integer(value));
	}
	
	/** Returns true when the URL already has an id. */
	public boolean isIn( String key )
	{
	    return linkHash.containsKey( key );
	}
    
	/**
	 * Returns the id of the given URL, allocating the next id and queueing
	 * the INSERT on first sight.  Also records how long the hit (gU1) or
	 * miss (gU2) path took and resets the dbl flag.
	 */
	public int getUID(String key) throws SQLException //{{{
	{
	    dbl = false;
	    long began = System.currentTimeMillis();

	    Integer known = (Integer) linkHash.get(key);
	    if(known != null) {
		lcacheHits++;
		gU1 = System.currentTimeMillis() - began;
		return known.intValue();
	    }

	    // first time this URL is seen: allocate an id and queue the row
	    began = System.currentTimeMillis();
	    int assigned = curUID;
	    stmtURLInsert.addBatch("INSERT INTO url (url,urlid) VALUES('" + key + "'," + assigned + ");" );
	    put(key, assigned);
	    curUID++;
	    lcacheMisses++;
	    gU2 = System.currentTimeMillis() - began;
	    return assigned;
	} //}}}
    } //}}}

    // Database Insertion functions {{{
    //  wordToDB Stuff {{{
    
    // Tag names, in the same order as the flag characters of a bintags
    // string: character i being '1' means tagArray[i] applies.
    private String [] tagArray = {
	"b","i","u","h1","h2","h3","h4","h5","h6","img","title","meta"
    };

    /**
     * Queues the insert of one word occurrence into word_to_url.
     *
     * @param wid     id of the word
     * @param urlid   id of the page the word occurs on
     * @param pos     position of the occurrence
     * @param bintags string of '0'/'1' flags, one per entry of tagArray;
     *                the set flags are stored as a comma-separated list of
     *                tag names, each name listed once
     */
    private void wordToDB(int wid, int urlid, Integer pos, String bintags) throws SQLException
    {
	StringBuffer tags = new StringBuffer();
	// Never read past the end of either the flag string or the tag table.
	int limit = Math.min(bintags.length(), tagArray.length);
	for(int i = 0; i < limit; i++) {
	    if(bintags.charAt(i) == '1') {
		if(tags.length() > 0) {
		    tags.append(',');
		}
		tags.append(tagArray[i]);
	    }
	}
	
	stmtWord_to_URL.addBatch("Insert into word_to_url (wid,urlid,pos,tag) VALUES("+wid+","+urlid+","+pos+",\""+tags+"\");" );
    } //}}}

    /**
     * Queues the database rows for one link: the words of the link (via
     * wordsToLinkTable) followed by one row in the link table for every
     * position at which the link occurs.  Ids for the target URL and then
     * for the source page are obtained from the link cache, which allocates
     * them if needed.
     */
    private void linkToDB(Link curLink) throws SQLException // {{{
    {
	Iterator positions = curLink.getPositions();
	// resolve the target first, then the page the link was found on
	int targetId = linkCache.getUID(curLink.getURL());
	int sourceId = linkCache.getUID(curLink.getSourceURL());

	wordsToLinkTable(curLink, targetId, sourceId);

	for( ; positions.hasNext(); ) {
	    Integer where = (Integer) positions.next();
	    String sql = "INSERT INTO link (lurl,pos,urlid) VALUES("+targetId+","+where+","+sourceId+");";
	    stmtLink.addBatch( sql );
	}
    } // }}}

    /**
     * Queues the rows for the words attached to a link.  For every
     * occurrence of every word a row goes into word_to_link, and the same
     * occurrence is also recorded against the source page via wordToDB().
     *
     * @param l      the link whose words are stored
     * @param urlid  id of the link's target URL
     * @param pageid id of the page the link appears on
     */
    private void wordsToLinkTable(Link l, int urlid, int pageid) throws SQLException //{{{
    {
	for( Iterator words = l.getWords(); words.hasNext(); ) {
	    String text = (String) words.next();
	    Word entry = (Word) l.wordList.get(text);
	    int wid = wordCache.getWID(text);

	    for( Iterator spots = entry.getDetails(); spots.hasNext(); ) {
		Integer pos = (Integer) spots.next();
		String sql = "INSERT INTO word_to_link (lid,pos,urlid,wid) VALUES("+urlid+","+pos+","+pageid+","+wid+");";
		stmtWord_to_Link.addBatch(sql);
		wordToDB(wid, pageid, pos, entry.posToTags(pos));
	    }
	}
    } //}}}
    //}}}

    /**
     * Runs one complete crawl and returns when the queues are exhausted or
     * a shutdown has been requested.
     *
     * Command line handling:
     *   -q              (first argument only) sets the quiet flag
     *   -d database     names the MySQL database to use
     *   --resume or -r  (first remaining argument) restores the state saved
     *                   by a previous run
     *   anything else   is taken as a list of start URLs; with no remaining
     *                   arguments a built-in list of servers is used
     *
     * The method starts the retriever and status threads, then repeatedly
     * takes a fetched page, parses it on a ParserThread (giving up after
     * PARSETIMEOUT milliseconds), queues the inserts for its words and
     * links, and adds newly discovered links to the toDo queue.  Start-up
     * failures (bad arguments, driver or connection errors) terminate the
     * JVM with status 1.
     */
    public void crawl( String[] args ) // {{{
    {
	// Variable Declaration for crawl() {{{
	doneCrawling = true;
	String database = "";
	startTime = System.currentTimeMillis();
	quiet = false;
	if(args.length > 0) {
	    if(args[0].equals("-q")) {
		quiet = true;
		String [] temp = new String[args.length - 1];
		System.arraycopy(args, 1, temp, 0, temp.length);
		args = temp;
	    }
	    for(int i=0; i<args.length; i++) {
		if(args[i].equals("-d")) {
		    if(args.length > i + 1) {
			database = args[i+1];
			// drop "-d" and its value, keep everything else in order
			String [] temp = new String[args.length - 2];
			System.arraycopy(args, 0, temp, 0, i);
			System.arraycopy(args, i + 2, temp, i, args.length - i - 2);
			args = temp;
			break;
		    }
		    else {
			System.err.println("ERROR: No argument after -d: must specify a database.");
			System.exit(1);
		    }
		}
	    }
	}
	    
	// load the jdbc driver for mysql
	try
	    { 
		Class.forName( "com.mysql.jdbc.Driver" ).newInstance(); 
	    }
	catch( Exception ex )
	    { 
		// handle the error
		System.err.println(ex);
		System.exit(1);
	    }
	    
	toDo = new PagesToVisit();
	toDoLater = new PagesToVisit();
	// Register the hook only now: it dereferences both queues, so an
	// earlier registration would fail inside the hook if the JVM exited
	// during argument parsing or driver loading.
	Runtime.getRuntime().addShutdownHook(new MyShutdown());

	Page p;
	String curUrl = "";
	Word curWord;
	int wid, urlid;
	String bintags = "";
	Integer pos;
	    
	indexedCount = 0;
	parseErrorCount = 0;
	fetchErrorCount = 0;
	checkCount = 0;
	errorCount = 0;
	SQLErrorCount = 0;
	noFollowCount = 0;

	    
	checkHash = new HashSet();
	// }}}

	// Initialization for crawl() {{{
	// NOTE(review): the database credentials are hard-coded in the source;
	// they should come from configuration instead.
	final String jdbcUrl = "jdbc:mysql://violet.mathcs.carleton.edu/" + database;
	final String dbUser = "webcrawler";
	final String dbPassword = "twlv34-1";
	try
	    {
		// open a connection to mysql
		conn = DriverManager.getConnection(jdbcUrl, dbUser, dbPassword);
	    }
	catch (SQLException ex) // {{{
	    {
		System.err.println("SQLException: " + ex.getMessage() );
		System.err.println("SQLState: " + ex.getSQLState() );
		System.err.println("VenderError: " + ex.getErrorCode() );
		System.exit(1);
	    } // }}}
	    
	prepareStmts();

	wordCache = new WordCache();
	linkCache = new LinkCache();
	quarantine = new Quarantine();
	// check the command line for a site to start at
	// default to starting at mathcs
	// be sure to add any sites to the hash to avoid duplicates
	    

	if(args.length == 0) {
	    String [] mainservers = {
		"http://www.carleton.edu",
		"http://www.mathcs.carleton.edu",
		"http://gridley.res.carleton.edu",
		"http://www.student.carleton.edu",
		"http://csa.carleton.edu",
		"http://www.acad.carleton.edu",
		"http://serc.carleton.edu",
		"http://webapps.acs.carleton.edu/sitemap/"
	    };
	    for (int i=0;i < mainservers.length; i++) {
		toDo.add(mainservers[i]);
	    }
	} else {
	    if(args[0].equals("--resume") || args[0].equals("-r")) {
		readState();
	    } else {
		for(int i=0; i<args.length; i++) {
		    toDo.add(args[i]);
		}
	    }
	}
	crawling = true;
	boolean firstrun = true;
	doneCrawling = false;
	myRetriever = new PageRetriever();
	myRetriever.start();

	mySystemState = new ShowSystemState();
	mySystemState.start();
	
	// }}}
	    
	// Active Crawling {{{
	// firstrun forces one pass so the wait loop below gets a chance to see
	// the first fetched page even though the stream list starts out empty.
	while( ((!toDo.isEmpty() || 
	         !toDoLater.isEmpty() ||
	         !URLStreamList.isEmpty()) &&
		 crawling) || 
	 	 firstrun) {
	    
	    firstrun = false;
	    long t1;
	    long insertTime=0;
	    boolean paused = false;
		
		// Grab a download stream {{{	
		while(URLStreamList.isEmpty() && crawling ) {
		    // nothing fetched and nothing left to fetch: the crawl is over
		    if (URLStreamList.isEmpty() && toDo.isEmpty() && toDoLater.isEmpty())
			crawling = false;
		    
		    // use the idle time to flush the batches, once per wait
		    if (paused == false && URLStreamList.isEmpty() && toDo.isEmpty()) {
			try {
			    commitStmts();
			} catch (SQLException sqe) {
			    // the crawl can continue, but the failure must be visible
			    SQLErrorCount++;
			    System.err.println("SQLException while flushing batches: " + sqe.getMessage() );
			}
			paused = true;
		    }
		    
		    try {
			Thread.sleep(500);
		    } catch (InterruptedException ie) {}
		}
		
		if(URLStreamList.isEmpty()) break;
		URLStream urlStream = (URLStream) URLStreamList.remove(0); 
		curUrl = urlStream.url;
		InputStream stream = urlStream.stream;
		
		// }}}
		
	    try {
		// Parse it {{{
		if(stream != null) {
		    t1 = System.currentTimeMillis();
		    System.out.println(curUrl);
		    
		    ParserThread pthread = new ParserThread(curUrl, stream);
		    pthread.start();
		    
		    // poll instead of join() so a stuck parser can be abandoned
		    while (pthread.isAlive()) {
			if (System.currentTimeMillis() - t1 > PARSETIMEOUT) {
			    pthread.interrupt();
			    System.out.println("Parser Timeout: "+curUrl);
			    break;
			}
			try {
			    Thread.sleep(10);
			} catch (InterruptedException ie) {
			}
		    }
		    // still null when the parser failed or timed out
		    p = pthread.page;
		}
		else p = null;	
		// }}}

		// Process the page object {{{
		if( p != null && !checkHash.contains(new Long(p.checkSum))) {
		    
		    // the page exists, so links that point at it may be stored
		    quarantine.release(p.getURL());
		    
		    if (debug_indexing) {
			if(!Limiter.index(curUrl)) {
			    System.out.print("Not indexing: ");
			}
			if(!Limiter.follow(curUrl)) {
			    System.out.print("Not following: ");
			}
		    }
		    t1 = System.currentTimeMillis();
		    checkHash.add(new Long(p.checkSum));
		    indexedCount++;
		    if( p.index ) {
			urlid = linkCache.getUID(p.getURL());
			//grab text and header info
			
			try {
			    insertPageText(urlid, p);
			} catch (SQLException exer) {
			    System.err.println("SQLException: " + exer.getMessage() );
			    System.err.println("SQLState: " + exer.getSQLState() );
			    System.err.println("VenderError: " + exer.getErrorCode() );		
			    // Assume the connection was lost: reconnect and retry once.
			    // NOTE(review): prepareStmts() replaces the batch objects,
			    // so statements still queued on the old ones are presumably
			    // never executed - confirm against BatchStatement.
			    conn = DriverManager.getConnection(jdbcUrl, dbUser, dbPassword);
			    prepareStmts();
			    
			    insertPageText(urlid, p);
			}
			
			// go through the words on the page
			Iterator wordit = p.getWords();
			while( wordit.hasNext() ) {
			    curWord = p.keyToWord( (String) wordit.next() );
			    tempWord = curWord.getWord();
			    wid = wordCache.getWID(tempWord);
			    // get the info for each word-occurence
			    Iterator detailit = curWord.getDetails();
			    while( detailit.hasNext() ) {
				// get the position and the tags for
				// the appearance of the word
				pos = (Integer) detailit.next();
				bintags = curWord.posToTags( pos );
				// add the word to the database
				wordToDB(wid, urlid, pos, bintags);
			    }
			}
			    
		    }

		    // add the links on the page to the queue and the hash
		    Iterator it = p.getLinks();
		    while( it.hasNext() ) {
			String s = (String) it.next();
			if (p.follow && (Limiter.index(s) || Limiter.follow(s))) {
			    if( !linkCache.isIn( s ) ) {
				// unknown target: queue it once and hold the link
				// until the target has been fetched
				if(!quarantine.isIn(s)) {
				    toDo.add( s );
				}
				quarantine.add(p.keyToLink(s));
			    } else {
				linkToDB(p.keyToLink(s));
			    }
			} else {
			    noFollowCount++;
			}
		    }

		    insertTime = System.currentTimeMillis() - t1;
		} else {
		    quarantine.kill(curUrl);
		    if(p != null) {
			if (debug_checksum)
			    System.err.println( "   CheckSum Collision: " + curUrl );
			checkCount++;
		    } else if(stream != null) {
			if (debug_parseerror)
			    System.err.println( "   Could not parse: " + curUrl );
			parseErrorCount++;
		    } else {
			if (debug_unknown)
			    System.err.println( "   Unknown Error: " + curUrl);
			errorCount++;
		    }
		} // }}}
		if (debug_inserttimes)
		    System.out.println("I: "+insertTime+" ms dbl: "+dbl);
		

	    } catch(BatchUpdateException bue) { // {{{
		    SQLErrorCount++;
		    System.err.println(bue);
		    System.err.println("SQLException: " + bue.getMessage() );
		    System.err.println("SQLState: " + bue.getSQLState() );
		    System.err.println("VenderError: " + bue.getErrorCode() );
		    System.err.println("curUrl: " + curUrl );
		    System.err.println("curWord: " + tempWord );
		    bue.printStackTrace(System.err);
	    } catch (SQLException ex) {
		    SQLErrorCount++;
		    System.err.println("SQLException: " + ex.getMessage() );
		    System.err.println("SQLState: " + ex.getSQLState() );
		    System.err.println("VenderError: " + ex.getErrorCode() );
		    System.err.println("curUrl: " + curUrl );
		    System.err.println("curWord: " + tempWord );
		    ex.printStackTrace(System.err);
	    } finally {
		// The page has been dealt with one way or another; release the
		// download so long crawls do not run out of connections or
		// file handles.
		if(stream != null) {
		    try {
			stream.close();
		    } catch (IOException ignored) {
			// nothing useful can be done about a failed close
		    }
		}
	    } // }}}
		    
	} // }}}

	
	System.out.println("Indexer Done");
	// Start Shutdown sequence after a crawl {{{
	crawling = false;

	// wake the status thread so it notices the flag and exits
	mySystemState.interrupt();
	
	if( rs != null ) {
	    try {
		rs.close();
	    } catch (SQLException sqlEx) {}
	    rs = null;
	}
	if( stmt != null ) {
	    try {
		stmt.close();
	    } catch (SQLException sqlEx) {}
	    stmt = null;
	}
	try {
	    commitStmts();
	} catch(SQLException bue) {
	    System.err.println(bue);
	}
	// Everything has been flushed; nothing after this point uses the database.
	if( conn != null ) {
	    try {
		conn.close();
	    } catch (SQLException sqlEx) {
		System.err.println(sqlEx);
	    }
	}
	doneCrawling = true;
	//}}}
    } // }}}

    // Writes one page's header and text into url_to_text through the
    // prepared statement.
    private void insertPageText(int urlid, Page p) throws SQLException
    {
	stmtUrl_to_Text.setInt(1, urlid);
	stmtUrl_to_Text.setString(2,p.getHeader());
	stmtUrl_to_Text.setString(3, p.getText());
	stmtUrl_to_Text.executeUpdate();
    }

    /**
     * Prints the end-of-run summary: page and error counters, how many
     * links are still queued, the elapsed time, the average time per page
     * and the cache statistics.
     */
    private void printInfo() { //{{{
	int hours;
	int minutes;
	int seconds;
	double elapsedTime = System.currentTimeMillis() - startTime;
	double totalTime = elapsedTime;
	// split the elapsed milliseconds into hours, minutes and seconds
	hours = (int) Math.floor(elapsedTime / 3600000);
	elapsedTime = elapsedTime - hours * 3600000;
	minutes = (int) Math.floor(elapsedTime / 60000);
	elapsedTime = elapsedTime - minutes * 60000;
	seconds = (int) Math.floor(elapsedTime / 1000);
	
	DecimalFormat myFormatter = new DecimalFormat("##");
	DecimalFormat myFormatter2 = new DecimalFormat("###");
	
	System.out.println(indexedCount + " pages crawled.");
	System.out.println(fetchErrorCount + " pages could not be fetched.");
	System.out.println(checkCount + " pages had checksum collisions.");
	System.out.println(parseErrorCount + " pages could not be parsed.");
	System.out.println(errorCount + " pages had unknown errors.");
	System.out.println(noFollowCount + " links not followed."); 

	// Count every place a pending URL can live.  The queues may still be
	// unset if the run ended before crawl() created them.
	int remaining = URLStreamList.size();
	if(toDo != null) remaining += toDo.size();
	if(toDoLater != null) remaining += toDoLater.size();
	if(remaining > 0) System.out.println(remaining + " links remaining on the queue.");

	System.out.println(myFormatter2.format(hours) + " h " + 
			   myFormatter.format(minutes) + " m " + 
			   myFormatter.format(seconds) + " s");
	// An average is only meaningful once at least one page was processed;
	// dividing by zero would print a nonsensical value.
	if(indexedCount > 0) {
	    System.out.println("Avg of " + Math.round(totalTime / indexedCount) + " ms per page.");
	}
	System.out.println("\n" + cacheHits + " cache hits.");
	System.out.println(cacheMisses + " cache misses.");
	System.out.println(dbLookups + " db lookups.");

	System.out.println("\n" + lcacheHits + " link cache hits.");
	System.out.println(lcacheMisses + " link cache misses.");
	System.out.println(ldbLookups + " link db lookups.");
	int total = cacheHits + cacheMisses + dbLookups;
	System.out.println("Total: " + total);
    } //}}}

    /** Program entry point: creates a crawler and runs one crawl with the command-line arguments. */
    public static void main( String[] args ) 
    {
	new Crawler().crawl(args);
    }
} // }}}
