/* Use, modification, and distribution are subject to the Boost Software 
License, Version 1.0. (See accompanying file LICENSE_1.0.txt or copy at 
www.boost.org/LICENSE_1.0.txt) */

// Parser Class
// This class contains the main page parsing function,
// which takes a page URL and an input stream of the page's content,
// parses that content, and returns a Page object containing page
// information.

import java.io.*;
import java.net.*;
import java.util.regex.*;
import java.util.zip.*;

/**
 * Incremental HTML parser that reads a page from an input stream in small
 * chunks, splits the text into words, tracks which formatting tags are open,
 * and records words, links, title text and robots directives on a
 * {@code Page} object.
 *
 * <p>An instance holds per-page state and is not safe for concurrent use;
 * {@link #parse(String, InputStream)} creates a fresh instance per call.
 */
class Parser
{
    // Indices into the tags array. A value > 0 means the tag is open.
    private static final int TAG_BOLD = 0;       // b, big, strong
    private static final int TAG_ITALIC = 1;     // i, em
    private static final int TAG_UNDERLINE = 2;  // u
    private static final int TAG_H1 = 3;         // h1..h6 occupy 3..8
    private static final int TAG_IMG_ALT = 9;    // set while indexing alt text
    private static final int TAG_TITLE = 10;     // title
    private static final int TAG_META = 11;      // set while indexing meta content
    private static final int TAG_COUNT = 12;

    // Opening one of these tags clears every formatting counter.
    private static final String [] TABLE_TAGS = {
        "col", "colgroup", "caption", "td", "tr", "table",
        "tbody", "tfoot", "th", "thead"
    };

    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
    // Word separators: character entities, or any non-word character other
    // than '-' and '\''.
    private static final Pattern WORD_SPLIT =
        Pattern.compile("&\\S*;|[\\W&&[^\\-']]");
    // Group 1: "/" for a closing tag. Group 3: a tag whose attributes are
    // inspected (a, area, img, meta). Group 4: any other tag name.
    // The lookahead requires the special name to be the whole tag name, so
    // that e.g. "abbr" or "address" is not mistaken for "a".
    private static final Pattern TAG_NAME =
        Pattern.compile("(?i)(/)?\\s*((a|area|img|meta)(?=[\\s/]|$)|(\\S+))");
    // Group 2/3/4: attribute is href/alt/content. The value is in group 7
    // (double-quoted), group 9 (single-quoted) or group 10 (unquoted).
    private static final Pattern ATTRIBUTE =
        Pattern.compile("(?i)((href)|(alt)|(content))\\s*=\\s*(([\"]\\s*([^\"]*)\\s*[\"])|([\']\\s*([^\']*)\\s*[\'])|(\\S*))");
    private static final Pattern ROBOTS_NAME =
        Pattern.compile("(?i)(name)\\s*=\\s*([\'\"]?\\s*robots\\s*[\'\"]?)");
    private static final Pattern NOINDEX = Pattern.compile("(?i)noindex");
    private static final Pattern NOFOLLOW = Pattern.compile("(?i)nofollow");
    // True when some '.' in the string has no '/' after it.
    private static final Pattern DOT_IN_LAST_SEGMENT =
        Pattern.compile(".*[^/]*\\.[^/]*");
    private static final Pattern REDIRECT =
        Pattern.compile("\\s*\\d+;\\sURL=(\\S+)\\s*\\Z");

    private int [] tags;
    private int position;
    private URL url;
    private Page page;
    private boolean ignore;
    private String curURL;
    private boolean comment;

    /** Escapes every single quote in {@code s} as backslash-quote. */
    private static String escapeQuotes(String s)
    {
        return s.replaceAll("'", "\\\\'");
    }

    /** Returns true if the first character of the non-empty {@code word} is an ASCII letter. */
    private static boolean startsWithLetter(String word)
    {
        char c = word.charAt(0);
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Closes {@code in}, discarding any failure; used only on paths that are already failing. */
    private static void closeQuietly(InputStream in)
    {
        try {
            in.close();
        } catch (IOException ignored) {
            // The parse has already failed; a close failure adds nothing.
        }
    }

    /**
     * Splits a run of text into words and records them on the page, or on
     * the link currently being read when {@code curURL} is set.
     *
     * @param sbuf text found outside of any tag
     * @param eob  1 if {@code sbuf} ends at a buffer boundary, in which case
     *             the last (possibly incomplete) word is held back and
     *             returned instead of being recorded; 0 to record everything
     * @return for {@code eob == 1}, the held-back trailing fragment in its
     *         original case; otherwise the last split item; {@code ""} while
     *         text is being ignored (inside script/style)
     */
    private String parseWords(String sbuf, int eob) // {{{
    {
        if (ignore) {
            return "";
        }

        String text = WHITESPACE_RUN.matcher(sbuf).replaceAll(" ");
        String lowered = text.toLowerCase();
        String [] words = WORD_SPLIT.split(lowered, -1);
        String carry = words[words.length - 1];

        // One '1'/'0' flag per tracked tag, in tags-array order.
        char [] flags = new char[tags.length];
        for (int i = 0; i < tags.length; i++) {
            flags[i] = (tags[i] > 0) ? '1' : '0';
        }
        String tagstring = new String(flags);

        Link l = null;
        if (curURL != null) {
            l = (Link)(page.linkList.get(curURL));
        }

        if (eob == 1) {
            int cut = text.length() - carry.length();
            // Hand back the fragment in its original case so the page text
            // is not lower-cased wherever a word straddles two buffers.
            if (lowered.length() == text.length()) {
                carry = text.substring(cut);
            }
            text = text.substring(0, cut);
        }

        text = escapeQuotes(text);
        if (!text.equals("") && !text.equals(" ")) {
            if (tags[TAG_TITLE] > 0) {
                page.addHeader(text);
            } else if (tags[TAG_META] == 0) {
                // meta content is indexed word by word but is not page text
                page.addText(text);
            }
        }

        for (int i = 0; i < words.length - eob; i++) {
            String word = words[i];
            if (word.equals("") || word.equals("-") || word.equals("'")) {
                continue;
            }
            word = escapeQuotes(word);
            words[i] = word;
            if (l != null) {
                // text inside an anchor belongs to that link
                l.addWord(word, position, tagstring);
            } else if (tags[TAG_TITLE] > 0 || startsWithLetter(word)) {
                // title words are always kept; body words must start with
                // a letter
                page.addWord(word, position, tagstring);
            }
            // the position advances for every non-empty word, kept or not
            position++;
        }

        return (eob == 1) ? carry : words[words.length - 1];
    } // }}}

    /**
     * Resolves {@code link} against the URL of the page being parsed.
     *
     * <p>Whitespace runs become {@code %20}, single quotes are
     * backslash-escaped, the fragment is removed, {@code &amp;} is decoded,
     * and a trailing {@code /} is appended when the result has no query
     * string and its last path segment contains no dot.
     *
     * @param link href value as written in the page
     * @return the absolute URL, or {@code null} if it cannot be formed
     */
    private String absURL(String link) // {{{
    {
        link = WHITESPACE_RUN.matcher(link).replaceAll("%20");
        link = escapeQuotes(link);
        URL abs_url;
        try {
            abs_url = new URL(url, link);
        } catch (MalformedURLException mue) {
            return null;
        }
        String absLink = abs_url.toString();
        absLink = absLink.replaceFirst("#[^\\?]*", "");
        // decode every entity, not just the first one
        absLink = absLink.replaceAll("&amp;", "&");

        // A literal '?' marks a query string. (As a regex, an unescaped '?'
        // would turn ".*" reluctant and match every URL.)
        boolean hasQuery = absLink.indexOf('?') >= 0;
        boolean dotInLastSegment = DOT_IN_LAST_SEGMENT.matcher(absLink).matches();
        if (!absLink.endsWith("/") && !dotInLastSegment && !hasQuery) {
            absLink = absLink + "/";
        }
        return absLink;
    } // }}}

    /** Adds {@code delta} to the counter at {@code index}, never letting it drop below zero. */
    private void adjustTag(int index, int delta)
    {
        int updated = tags[index] + delta;
        tags[index] = (updated < 0) ? 0 : updated;
    }

    /**
     * Updates parser state for a tag that carries no indexed attributes.
     *
     * @param tag        tag name as matched (any case)
     * @param tag_change 1 for an opening tag, -1 for a closing tag
     */
    private void handleTag(String tag, int tag_change) // {{{
    {
        tag = tag.toLowerCase();
        if (tag.equals("script") || tag.equals("style")) {
            // text inside these elements is not indexed
            ignore = (tag_change == 1);
        } else if (tag.startsWith("!--")) {
            comment = true;
        } else if (tag.equals("b") || tag.equals("big") || tag.equals("strong")) {
            adjustTag(TAG_BOLD, tag_change);
        } else if (tag.equals("i") || tag.equals("em")) {
            adjustTag(TAG_ITALIC, tag_change);
        } else if (tag.equals("u")) {
            adjustTag(TAG_UNDERLINE, tag_change);
        } else if (tag.length() == 2 && tag.charAt(0) == 'h'
                   && tag.charAt(1) >= '1' && tag.charAt(1) <= '6') {
            adjustTag(TAG_H1 + (tag.charAt(1) - '1'), tag_change);
        } else if (tag.equals("title")) {
            adjustTag(TAG_TITLE, tag_change);
        } else if (tag_change == 1) {
            for (int t = 0; t < TABLE_TAGS.length; t++) {
                if (tag.equals(TABLE_TAGS[t])) {
                    // a new table element resets all formatting state
                    for (int i = 0; i < tags.length; i++) {
                        tags[i] = 0;
                    }
                    break;
                }
            }
        }
    } // }}}

    /** Returns the attribute value matched by {@code ATTRIBUTE}: double-quoted, else single-quoted, else bare. */
    private static String attributeValue(Matcher attrMatcher)
    {
        if (attrMatcher.group(7) != null) {
            return attrMatcher.group(7);
        }
        if (attrMatcher.group(9) != null) {
            return attrMatcher.group(9);
        }
        return attrMatcher.group(10);
    }

    /**
     * Parses the text between {@code <} and {@code >}.
     *
     * <p>For a, area, img and meta tags the first href/alt/content attribute
     * is processed (link recorded, alt/meta text indexed, robots directives
     * applied). Every other tag is passed to {@link #handleTag}.
     *
     * @param sbuf tag content without the surrounding angle brackets
     */
    private void parseTag(String sbuf) // {{{
    {
        Matcher tagMatcher = TAG_NAME.matcher(sbuf);
        if (!tagMatcher.find()) {
            return;
        }
        int tag_change = (tagMatcher.group(1) == null) ? 1 : -1;
        String special = tagMatcher.group(3);

        if (special == null) {
            // ordinary tag: update formatting / ignore / comment state
            handleTag(tagMatcher.group(4), tag_change);
            // a comment that fits entirely inside this tag is already over
            if (comment && sbuf.endsWith("--")) {
                comment = false;
            }
            return;
        }

        // Only anchors enclose text, so only they open and close curURL.
        boolean anchor = special.equalsIgnoreCase("a");
        if (tag_change == -1) {
            if (anchor) {
                curURL = null;
            }
            return;
        }

        Matcher attrMatcher = ATTRIBUTE.matcher(sbuf);
        if (!attrMatcher.find()) {
            return;
        }
        String value = attributeValue(attrMatcher);

        if (attrMatcher.group(2) != null) {
            // href: skip mail links, make the rest absolute and record them
            if (!value.regionMatches(true, 0, "mailto:", 0, 7)) {
                String link = absURL(value);
                if (link != null) {
                    page.addLink(link, position);
                    if (anchor) {
                        curURL = link;
                    }
                }
            }
        } else if (attrMatcher.group(3) != null) {
            // alt text: index it with the image flag set
            tags[TAG_IMG_ALT] = 1;
            parseWords(value, 0);
            tags[TAG_IMG_ALT] = 0;
        } else if (ROBOTS_NAME.matcher(sbuf).find()) {
            // <meta name="robots" content="...">
            if (NOINDEX.matcher(value).find()) {
                page.setNoIndex(true);
            }
            if (NOFOLLOW.matcher(value).find()) {
                page.setNoFollow(true);
            }
        } else {
            // other meta content: index it with the meta flag set
            tags[TAG_META] = 1;
            parseWords(value, 0);
            tags[TAG_META] = 0;
        }
    } //}}}

    /**
     * Parses the page read from {@code stream} and returns its description.
     *
     * <p>The stream is read in fixed-size chunks; each byte is taken as one
     * character (ISO-8859-1 style). Once parsing has started, the stream is
     * closed on success and on I/O failure alike.
     *
     * @param pageURL URL of the page, used to resolve relative links
     * @param stream  page content; may be {@code null}
     * @return the populated page, or {@code null} if {@code Limiter.parse}
     *         rejects the URL, {@code stream} is {@code null}, the URL is
     *         malformed, or an I/O error occurs
     */
    public Page myParse(String pageURL, InputStream stream) // {{{
    {
        if (!Limiter.parse(pageURL))
            return null;
        if (stream == null)
            return null;

        // reset all per-page state so an instance can be reused
        position = 0;
        comment = false;
        ignore = false;
        curURL = null;
        page = new Page(pageURL);
        tags = new int[TAG_COUNT];  // all counters start at zero

        final int BUFFSIZE = 600;
        byte [] bbuf = new byte[BUFFSIZE];
        char [] cbuf = new char[BUFFSIZE];
        String sbuf = "";
        String temp = "";   // unprocessed tail carried into the next read
        boolean in_tag = false;
        boolean read_more = true;

        try {
            page.setNoFollow(!Limiter.follow(pageURL));
            page.setNoIndex(!Limiter.index(pageURL));
            url = new URL(pageURL);
            CheckedInputStream datastream =
                new CheckedInputStream(stream, new CRC32());

            while (true) {
                if (read_more) {
                    int numRead = datastream.read(bbuf, 0, BUFFSIZE);
                    if (numRead == -1) {
                        // End of input: the held-back trailing word is
                        // complete now, so record it. An unterminated tag
                        // or comment is dropped.
                        if (!in_tag && temp.length() > 0) {
                            parseWords(temp, 0);
                        }
                        page.setCheckSum(datastream.getChecksum().getValue());
                        break;
                    }
                    for (int i = 0; i < numRead; i++) {
                        // mask so bytes >= 0x80 are not sign-extended
                        cbuf[i] = (char)(bbuf[i] & 0xFF);
                    }
                    sbuf = temp.concat(new String(cbuf, 0, numRead));
                    read_more = false;
                }

                if (!in_tag) {
                    int lt = sbuf.indexOf('<');
                    if (lt < 0) {
                        // no tag in sight: index what we have, keep the
                        // last word back in case it continues
                        temp = parseWords(sbuf, 1);
                        read_more = true;
                    } else {
                        if (lt >= 1) {
                            parseWords(sbuf.substring(0, lt), 0);
                        }
                        sbuf = sbuf.substring(lt + 1);
                        in_tag = true;
                    }
                } else if (comment) {
                    int end = sbuf.indexOf("-->");
                    if (end < 0) {
                        temp = sbuf;
                        read_more = true;
                    } else {
                        comment = false;
                        sbuf = sbuf.substring(end + 3);
                        in_tag = false;
                    }
                } else {
                    int gt = sbuf.indexOf('>');
                    if (gt < 0) {
                        temp = sbuf;
                        read_more = true;
                    } else {
                        parseTag(sbuf.substring(0, gt));
                        sbuf = sbuf.substring(gt + 1);
                        // parseTag may have opened a comment that is still
                        // running; stay in tag mode until "-->" is seen
                        if (!comment) {
                            in_tag = false;
                        }
                    }
                }
            }

            stream.close();

            // A page whose whole text is "<n>; URL=<target>" is treated as
            // a redirect: do not index it, but follow the target.
            Matcher m = REDIRECT.matcher(page.getText());
            if (m.matches()) {
                page.setNoIndex(true);
                page.addLink(m.group(1), 1);
            }
        }
        catch (IOException ioe) {
            // do not leak the stream when reading fails part-way
            closeQuietly(stream);
            return null;
        }
        return page;
    } /// }}}

    /**
     * Convenience entry point: parses one page with a fresh parser.
     *
     * @see #myParse(String, InputStream)
     */
    public static Page parse(String pageURL, InputStream stream) {
        Parser p = new Parser();
        return p.myParse(pageURL, stream);
    }
}

