/* Use, modification, and distribution are subject to the Boost Software License, Version 1.0. (See accompanying file LICENSE_1.0.txt or copy at www.boost.org/LICENSE_1.0.txt) */
// ---------------------------------------------------------------------------
// Reviewer documentation for class Parser. The code below is unchanged.
//
// NOTE(review): in this copy of the file the original line breaks are gone
// and several spans that sat between a '<' and the following '>' appear to be
// missing. Examples: `for(int i=0; i 0)` (twice in parseWords), the jump from
// the table-tag loop in handleTag straight to `"); Pattern pEndComment`
// (which loses the rest of handleTag, all of parseTag and the start of
// myParse), and `for(int i=0; i= 1)` in myParse. The notes below describe
// only what is still visible. Restore the missing spans from version control
// before changing any logic.
//
// Purpose (from the original header): the class contains the main page
// parsing function, which takes a URL, parses it, and returns a Page object
// containing page information. The static parse(pageURL, stream) at the end
// creates a fresh Parser for every call and returns p.myParse(pageURL,
// stream).
//
// Fields
//   tags      int counters, one slot per kind of markup. handleTag uses
//             0 = b/big/strong, 1 = i/em, 2 = u, 3..8 = h1..h6, 10 = title.
//             parseWords also reads tags[11]; where that slot is written is
//             not visible in this copy.
//   tempTags  declared here; its use is not visible in this copy.
//   position  incremented once per iteration of the word loop in parseWords
//             and passed to page.addWord.
//   url       base URL for absURL; assigned from pageURL in myParse.
//   page      the Page that parseWords/myParse fill in and myParse returns.
//   ignore    set by handleTag for "script" and "style": true when
//             tag_change == 1, false for any other tag_change. While it is
//             true, parseWords does nothing and returns "".
//   curURL    when non-null, parseWords uses it as the key for a lookup in
//             page.linkList.
//   comment   set to true by handleTag for a tag starting with "!--". While
//             it is set, myParse searches with pEndComment ("-->") instead
//             of pEndTag, then clears the flag and skips three characters.
//
// parseWords(sbuf, eob)
//   * Collapses each whitespace run to one space, keeps that text as tbuf,
//     then lower-cases sbuf and splits it on the regex
//     &\S*;|[\W&&[^\-']]  (an "&...;" run, or any non-word character other
//     than '-' and '\''). The limit of -1 keeps trailing empty strings. The
//     older comment says "splits on whitespace", which is narrower than what
//     the regex does.
//   * Builds tagstring out of "1"/"0" characters; the loop header that
//     decides the length is damaged (presumably one character per tags
//     slot - confirm).
//   * When eob == 1, tbuf loses as many trailing characters as the last
//     split item has. That item is also the return value (presumably so the
//     caller can carry a word cut at a buffer boundary into the next
//     call - confirm). The one visible call site passes 0.
//   * Every '\'' in tbuf is escaped as \' before tbuf is stored.
//   * If tbuf is neither "" nor " ": page.addHeader(tbuf) when tags[10] > 0,
//     otherwise page.addText(tbuf) when tags[11] == 0.
//   * Words go to page.addWord(word, position, tagstring). Per the inline
//     comments, words in a title are always added and words in the body
//     only when they match [A-Za-z].* (start with an ASCII letter). The
//     conditions that open this loop are damaged.
//   * Returns the last split item, or "" when ignore is true.
//
// absURL(link)
//   * Replaces whitespace runs in link with "%20", escapes '\'' as \', and
//     resolves the result against the field url via new URL(url, link).
//     Returns null if that throws MalformedURLException.
//   * Removes the first '#' together with the following run of non-'?'
//     characters.
//   * NOTE(review): replaceFirst("&","&") replaces "&" with itself, so as
//     written it has no effect; the pattern may originally have been an
//     HTML entity such as "&amp;" - confirm against version control.
//   * Appends "/" only when all three tests hold: the string does not end
//     with "/", does not match .*[^/]*\.[^/]* (no '.' that has no '/' after
//     it), and does not match the third pattern ".*?.*".
//   * NOTE(review): ".*?.*" matches every string that contains no line
//     terminator, so the third test is false for ordinary URLs and the
//     slash is never appended. A literal question mark (".*\\?.*") may have
//     been intended - confirm before changing.
//
// handleTag(tag, tag_change)
//   * Lower-cases tag, then dispatches on its name as listed under "Fields".
//     Each counter has tag_change added and is clamped so it never drops
//     below 0.
//   * For col, colgroup, caption, td, tr, table, tbody, tfoot, th and thead
//     with tag_change == 1 a loop starts; its body is lost in this copy.
//
// myParse (signature not visible; called as myParse(pageURL, stream))
//   * Sets page.setNoFollow(!Limiter.follow(pageURL)) and
//     page.setNoIndex(!Limiter.index(pageURL)), builds url from pageURL and
//     wraps stream in a CheckedInputStream with a CRC32.
//   * Reads up to BUFFSIZE bytes at a time whenever read_more is set, and
//     alternates between text mode (text before a match is passed to
//     parseWords(..., 0)) and tag mode (text before the match is passed to
//     parseTag). When no match is found, the remaining text is kept in temp
//     and read_more is set again.
//   * After the loop it closes stream when non-null, then tests
//     page.getText() against \s*\d+;\sURL=(\S+)\s*\Z . On a full match it
//     calls page.setNoIndex(true) and page.addLink(group 1, 1).
//   * Returns page, or null when an IOException is caught.
//   * NOTE(review): stream.close() sits inside the try block, so the
//     IOException path returns null without closing stream.
// ---------------------------------------------------------------------------
// Parser Class // This class contains the main page parsing function, // which takes a URL, parses it, and returns a Page object // containing page information. import java.io.*; import java.net.*; import java.util.regex.*; import java.util.zip.*; class Parser { private int [] tags; private int [] tempTags; private int position; private URL url; private Page page; private boolean ignore; private String curURL; private boolean comment; private String parseWords(String sbuf, int eob) // {{{ // splits sbuf on whitespace and inserts splits into page object // returns the last split item { if(!ignore) { sbuf = sbuf.replaceAll("\\s+"," "); String tbuf = sbuf; sbuf = sbuf.toLowerCase(); String [] words = sbuf.split("&\\S*;|[\\W&&[^\\-']]", -1); String tagstring = ""; for(int i=0; i 0) tagstring = tagstring + "1"; else tagstring = tagstring + "0"; } Link l = null; if(curURL != null) { l = (Link)(page.linkList.get(curURL)); } if (eob == 1) { tbuf = tbuf.substring(0,(tbuf.length() - words[words.length-1].length())); } tbuf = tbuf.replaceAll("'","\\\\'"); if (!tbuf.equals("") && !tbuf.equals(" ")) { if (tags[10] > 0) { page.addHeader(tbuf); } else if (tags[11] == 0) { page.addText(tbuf); } } for(int i=0; i 0) { // if the word is in a title, insert it no matter what page.addWord(words[i], position, tagstring); } else if(words[i].matches("[A-Za-z].*")) { // if the word is in the body, make sure it begins with a // letter page.addWord(words[i], position, tagstring); } } position++; } } return words[words.length-1]; } else return ""; } // }}} private String absURL(String link) // {{{ { link = link.replaceAll("\\s+","%20"); link = link.replaceAll("'","\\\\'"); URL abs_url; try { abs_url = new URL(url, link); } catch (MalformedURLException mue) { return null; } String absLink = abs_url.toString(); absLink 
= absLink.replaceFirst("#[^\\?]*", ""); absLink = absLink.replaceFirst("&","&"); if(!absLink.endsWith("/") && !absLink.matches(".*[^/]*\\.[^/]*") && !absLink.matches(".*?.*")) absLink = absLink+"/"; return absLink; } // }}} private void handleTag(String tag, int tag_change) // {{{ { // make changes to the tags array based on what tag appears tag = tag.toLowerCase(); // tags to ignore the text of if(tag.equals("script") || tag.equals("style")) { if(tag_change == 1) ignore = true; else ignore = false; } else if(tag.startsWith("!--")) { comment = true; } else if(tag.equals("b") || tag.equals("big") || tag.equals("strong")) { tags[0] = tags[0] + tag_change; if(tags[0] < 0) tags[0] = 0; } else if(tag.equals("i") || tag.equals("em")) { tags[1] = tags[1] + tag_change; if(tags[1] < 0) tags[1] = 0; } else if(tag.equals("u")) { tags[2] = tags[2] + tag_change; if(tags[2] < 0) tags[2] = 0; } else if(tag.equals("h1")) { tags[3] = tags[3] + tag_change; if(tags[3] < 0) tags[3] = 0; } else if(tag.equals("h2")) { tags[4] = tags[4] + tag_change; if(tags[4] < 0) tags[4] = 0; } else if(tag.equals("h3")) { tags[5] = tags[5] + tag_change; if(tags[5] < 0) tags[5] = 0; } else if(tag.equals("h4")) { tags[6] = tags[6] + tag_change; if(tags[6] < 0) tags[6] = 0; } else if(tag.equals("h5")) { tags[7] = tags[7] + tag_change; if(tags[7] < 0) tags[7] = 0; } else if(tag.equals("h6")) { tags[8] = tags[8] + tag_change; if(tags[8] < 0) tags[8] = 0; } else if(tag.equals("title")) { tags[10] = tags[10] + tag_change; if(tags[10] < 0) tags[10] = 0; } else if(tag_change == 1 && (tag.equals("col") || tag.equals("colgroup") || tag.equals("caption") || tag.equals("td") || tag.equals("tr") || tag.equals("table") || tag.equals("tbody") || tag.equals("tfoot") || tag.equals("th") || tag.equals("thead"))) { for(int i=0; i"); Pattern pEndComment = Pattern.compile("-->"); try { page.setNoFollow(!Limiter.follow(pageURL)); page.setNoIndex(!Limiter.index(pageURL)); // open the connection url = new URL(pageURL); 
datastream = new CheckedInputStream(stream, new CRC32()); while(true) { if(read_more) { numRead = datastream.read(bbuf, 0, BUFFSIZE); if(numRead != -1) { for(int i=0; i= 1) parseWords(sbuf.substring(0, m.start()),0); sbuf = sbuf.substring(m.start()+1); // set in_tag flag to true in_tag = true; } } else { if(!comment) m = pEndTag.matcher(sbuf); else m = pEndComment.matcher(sbuf); if(!m.find()) { // set temp to whatever is left and read_more to true temp = sbuf; read_more = true; } else { if(!comment) { // parse tag and remove it from sbuf // add tag info to tags string parseTag(sbuf.substring(0, m.start())); sbuf = sbuf.substring(m.start()+1); // set in_tag to false if(!comment) in_tag = false; } else { comment = false; sbuf = sbuf.substring(m.start()+3); in_tag = false; } } } } if(stream != null) stream.close(); Pattern redirect = Pattern.compile("\\s*\\d+;\\sURL=(\\S+)\\s*\\Z"); m = redirect.matcher(page.getText()); if (m.matches()) { page.setNoIndex(true); page.addLink(m.group(1),1); } } catch(IOException ioe) { return null; } return page; } /// }}} public static Page parse(String pageURL, InputStream stream) { Parser p = new Parser(); return p.myParse(pageURL, stream); } }