Question: Please find the problems with this code. //**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~ // File: cnnCrawler.java // // This code looks at the CNN website and follows some links to
Please find the problems with this code.
//**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~ // File: cnnCrawler.java // // This code looks at the CNN website and follows some links to get info on articles that I want more // info on. // All output is written in the working directory to: cnnCrawlerOutput.txt //**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~ import gnu.regexp.*; import java.net.*; import java.io.*; public class cnnCrawler{ public static void main(String[] args) { StringBuffer basePage = new StringBuffer(); // Connect to CNN and get the document basePage = getBasePageContents("http://www.cnn.com"); // Look at the area of interest (The "MORE FROM CNN" section) basePage = initialIsolateBasePageContents(basePage); // Pull all of the URLs out basePage = getInfo(basePage, " ]*|/b>]*"); basePage = getInfo(basePage, "\"/[^(\")]*"); basePage = getInfo(basePage,"\"[^&]*"); // Go to the URLs and pull out the information of interest and // write to file. goToURLs(basePage); } //**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~ // Method: getBasePageContents // // This method opens a connection to the webpage we are interested in and stores // all of the text on the page //**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~ public static StringBuffer getBasePageContents(String myURL){ try{ // Set base document to CNN, open connection, // and copy the source text into a buffer URL cnnBaseDoc = new URL(myURL); cnnBaseDoc.openConnection(); BufferedReader cnnBaseBuffer = new BufferedReader( new InputStreamReader( cnnBaseDoc.openStream())); String cnnBaseInputLine; StringBuffer tempDocument = new StringBuffer(); while ((cnnBaseInputLine = cnnBaseBuffer.readLine()) != null){ tempDocument.append(cnnBaseInputLine); } cnnBaseBuffer.close(); return(tempDocument); } catch(MalformedURLException e) { System.out.println("Unable to create URL object"); return(null); } catch(IOException e){ 
System.out.println("Unable to open URL"); return(null); } } //**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~ // Method: initialIsolateBasePageContents // // This method isolates us to store only the section we are interest in -- // the "MORE FROM CNN" section // //**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~
public static StringBuffer initialIsolateBasePageContents(StringBuffer basePage){
    // basePage  - full page text; it is trimmed IN PLACE
    // returns   - the same buffer, reduced to the text from "MORE FROM CNN"
    //             up to (not including) the next ">SPORTS", or null after
    //             printing a message if the input is null or a marker is
    //             missing
    if (basePage == null) {
        System.out.println("No page contents to isolate");
        return(null);
    }

    // Both markers are plain literals, so an index search finds the same
    // start positions a regular expression would.
    int iLIsolatorIndex = basePage.indexOf("MORE FROM CNN");
    if (iLIsolatorIndex < 0) {
        System.out.println("Unable to find the MORE FROM CNN section");
        return(null);
    }

    // Look for the right isolator only after the left one, so an earlier
    // ">SPORTS" elsewhere on the page cannot produce an inverted range.
    int iRIsolatorIndex = basePage.indexOf(">SPORTS", iLIsolatorIndex);
    if (iRIsolatorIndex < 0) {
        System.out.println("Unable to find the end of the MORE FROM CNN section");
        return(null);
    }

    // Cut the tail first so the left index stays valid for the second cut.
    basePage.delete(iRIsolatorIndex, basePage.length());
    basePage.delete(0, iLIsolatorIndex);
    return(basePage);
}

//**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~
// Method: getInfo
//
// This method applies the specified regular expression to the string passed in.
//
// textToSearch - text to scan; null is treated as text with no matches
// regExp       - gnu.regexp pattern; null is treated as matching nothing
// returns      - every match in order, each followed by a single space;
//                empty when nothing matches; null after printing a message
//                if the pattern cannot be compiled
//**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~
public static StringBuffer getInfo(StringBuffer textToSearch, String regExp){
    StringBuffer sIsolated = new StringBuffer("");
    if (textToSearch == null || regExp == null) {
        return(sIsolated);
    }
    try{
        // Compile the pattern once; wrapping a compiled RE in another RE
        // would recompile its toString() form rather than the pattern.
        RE pattern = new RE(regExp);
        REMatchEnumeration matches = pattern.getMatchEnumeration(textToSearch);
        while (matches.hasMoreMatches()){
            sIsolated.append(matches.nextMatch().toString());
            sIsolated.append(" ");
        }
        return(sIsolated);
    }
    catch(REException e){
        System.out.println("RE Exception");
        return(null);
    }
}

//**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~
// Method: goToURLs
//
// This method pulls every site-relative path (text starting with "/" and
// running up to the next double quote) out of textToSearch, downloads
// http://www.cnn.com + path, and writes the getDocInfo summary of each page
// to cnnCrawlerOutput.txt in the working directory.
// Pages that cannot be downloaded are skipped; they still use up a page number.
//
// textToSearch - text containing the paths; null means nothing to visit
//**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~
public static void goToURLs(StringBuffer textToSearch) {
    if (textToSearch == null) {
        System.out.println("No URLs to visit.");
        return;
    }

    PrintStream pCnnOut = null;
    try{
        RE urlPattern = new RE("/[^\"]*");
        REMatchEnumeration matches = urlPattern.getMatchEnumeration(textToSearch);

        pCnnOut = new PrintStream(new FileOutputStream("cnnCrawlerOutput.txt"));

        int numPage = 0;
        while (matches.hasMoreMatches()) {
            numPage++;
            String sIsolated = "http://www.cnn.com" + matches.nextMatch().toString();

            StringBuffer interestingDoc = connectToURLs(sIsolated);
            if (interestingDoc == null) {
                // connectToURLs already reported the failure; keep crawling.
                continue;
            }
            pCnnOut.println(getDocInfo(interestingDoc, sIsolated, numPage));
        }

        // PrintStream never throws on write failure; it only sets a flag.
        if (pCnnOut.checkError()) {
            System.out.println ("Error writing file.");
        }
        else {
            System.out.println("You may view the output in file: cnnCrawlerOutput.txt.");
        }
    }
    catch(REException e){
        System.out.println("RE Exception");
    }
    catch (Exception e) {
        System.out.println ("Error writing file.");
    }
    finally{
        // Close the output file on every path, including failures.
        if (pCnnOut != null) {
            pCnnOut.close();
        }
    }
}

//**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~
// Method: connectToURLs
// This method opens a URL and returns the text of the page.
//
// urlText - address of the page to download
// returns - whatever getBasePageContents returns for that address: the page
//           text, or null after a message has been printed
//**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~
public static StringBuffer connectToURLs(String urlText){
    // Downloading is identical for every page, so reuse the one
    // implementation instead of keeping a second copy in step with it.
    return(getBasePageContents(urlText));
}

//**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~
// Method: getDocInfo
//
// This method returns the interesting information that we were asked to parse out
// including: Date, Place, Headline,
URL, and First paragraph. //**~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~ public static StringBuffer getDocInfo(StringBuffer doc, String URL, int ID){ StringBuffer importantInfoToReturn = new StringBuffer(""); StringBuffer Headline = new StringBuffer(""); StringBuffer Date = new StringBuffer(""); StringBuffer Place = new StringBuffer(""); StringBuffer FirstParagraph = new StringBuffer(""); URL = URL.substring(0, (URL.length()-1)); Date.append(getInfo(doc, "name=\"DATE\" content=\"[^>]*")); if(Date.length() > 0){ Date.delete(0,21); Date.delete((Date.length()-1), Date.length()); } else{ Date.append("No date Reported."); } Place.append(getInfo(doc, "
[^(
)]*|
[^-]*")); if(Place.length() > 0){ Place.delete(0,6); } else{ Place.append("No location Reported."); } Headline.append(getInfo(doc, "
Step by Step Solution
There are 3 Steps involved in it
Get step-by-step solutions from verified subject matter experts
