LinuxQuestions.org - java tag reader

Hey,

I'm using javax.swing.text.html.HTMLEditorKit.ParserCallback, to parse and extract content from webpages. But for some reason it doesnt read the LINK tag and META tag, but reads everything else. Does anyone have any idea why?

As of now the program would read the page:

http://www.cnn.com/2004/US/10/06/dog....ap/index.html

The source of the webpage could be viewed there.

Thanks
raven

<CODE>
public class test extends ParserCallback {
/** The tag currently being processed */
private HTML.Tag currentTag = null;
private boolean toParse = true;
private String justText = "";

public test(){
HTMLEditorKit.Parser parser = new ParserDelegator();

//Collections.sort(htmlFileNames);
try{
BufferedReader reader = new BufferedReader( new InputStreamReader( new URL( "http://www.cnn.com/2004/US/10/06/dog.attack.ap/index.html" ).openStream() ) );
// parse the HTML document
parser.parse(reader, this, false);
} catch (IOException e){e.printStackTrace(System.out);}

}

/** This method is called when the HTML parser encounts the beginning
* of a tag that means that the tag is paired by an end tag and it's
* not an empty one.
*/
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
currentTag = t;
System.out.println(t);
if (HTML.Tag.META == t){

Enumeration e = a.getAttributeNames();
while(e.hasMoreElements()) {

HTML.Attribute tempAtt = (HTML.Attribute) e.nextElement();
if( tempAtt == HTML.Attribute.CONTENT ){

justText += " " + a.getAttribute(tempAtt);
}
}
}

}//handleStartTag

public void handleEndTag(HTML.Tag t, int pos) {
}//handleStartTag

public void flush() throws BadLocationException {
} // flush

/** This method is called when the HTML parser encounts text (PCDATA)*/
public void handleText(char[] text, int pos){

if(HTML.Tag.P == currentTag){
//text of tag A
String tagText = new String(text);
justText += " "+tagText;
}// End if

}// end handleText();

private String getString(){
return justText;
}

public static void main(String[] args) {
// create a new Htmldocument handler
test htmlDocHandler = new test();

//System.out.println( htmlDocHandler.getString() );
}// main

}

</CODE>