Reference: HTML Parser 对于网页格式中的文本,提取其内容
I needed to parse an HTML page to get some parameters of a Java Applet tag.
HTML Parser is a Java library. It supports the tags listed below.
// One NodeClassFilter per node/tag class the post mentions; each filter is built
// from the class named in its constructor argument (presumably it then accepts
// only nodes of that class -- confirm against the HTML Parser javadoc).
NodeFilter textFilter = new NodeClassFilter(TextNode.class);
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
NodeFilter appletFilter = new NodeClassFilter(AppletTag.class);
NodeFilter imageFilter = new NodeClassFilter(ImageTag.class);
NodeFilter frameFilter = new NodeClassFilter(FrameTag.class);
NodeFilter scriptFilter = new NodeClassFilter(ScriptTag.class);
NodeFilter formFilter = new NodeClassFilter(FormTag.class);
NodeFilter objectFilter = new NodeClassFilter(ObjectTag.class);
NodeFilter remarkFilter = new NodeClassFilter(RemarkNode.class);
NodeFilter metaFilter = new NodeClassFilter(MetaTag.class);
The complete Java code is:
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.tags.AppletTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
/**
 * Downloads an HTML page, selects its applet tags with the HTML Parser library
 * and collects the text of every leaf node beneath them that mentions "param".
 *
 * <p>The collected strings are accumulated in {@link #findout} and printed to
 * standard output by {@link #main(String[])}.
 */
public class GetTaiFex {

    /** Page whose applet tags are inspected. */
    private static final String PAGE_URL =
            "http://java.sun.com/applets/jdk/1.4/demo/applets/Animator/example1.html";

    /** Text of every matching leaf node, in the order the nodes were visited. */
    static List<String> findout = new ArrayList<String>();

    /**
     * Parses {@link #PAGE_URL}, keeps only applet tags, and prints the list of
     * leaf-node texts containing "param".
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            ConnectionManager cm = new ConnectionManager();
            Hashtable<String, String> properties = new Hashtable<String, String>();
            properties.put("User-Agent", "IE/6.0");
            // Further request properties can be added here if the target page
            // needs them, e.g. properties.put("Cookie", "ValidLogin=1");
            cm.setRequestProperties(properties);
            // Install the configured connection manager before creating the parser.
            Parser.setConnectionManager(cm);

            Parser parser = new Parser(PAGE_URL);
            NodeFilter appletFilter = new NodeClassFilter(AppletTag.class);
            NodeList list = parser.parse(appletFilter);

            processNodeList(list, "param");
            System.out.println(findout);
        } catch (ParserException e) {
            // Report which page failed in addition to the stack trace.
            System.err.println("Failed to parse " + PAGE_URL + ": " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Walks {@code list} depth-first and appends to {@link #findout} the text of
     * every childless node whose text contains {@code keyword}.
     *
     * <p>The comparison ignores case because HTML tag and attribute names are
     * case-insensitive, so {@code <PARAM ...>} and {@code <param ...>} must both
     * be found.
     *
     * @param list    nodes to inspect; {@code null} is treated as empty
     * @param keyword substring to look for (case-insensitive)
     */
    private static void processNodeList(NodeList list, String keyword) {
        // Lower-case the keyword once instead of once per visited node.
        collectMatches(list, keyword.toLowerCase(Locale.ROOT));
    }

    /** Recursive worker for {@link #processNodeList}; expects an already lower-cased keyword. */
    private static void collectMatches(NodeList list, String lowerKeyword) {
        if (list == null) {
            return;
        }
        SimpleNodeIterator iterator = list.elements();
        while (iterator.hasMoreNodes()) {
            Node node = iterator.nextNode();
            NodeList childList = node.getChildren();
            if (childList != null) {
                // Composite node: only its descendants are examined, not its own text.
                collectMatches(childList, lowerKeyword);
                continue;
            }
            String result = node.getText();
            if (result != null && result.toLowerCase(Locale.ROOT).contains(lowerKeyword)) {
                // Store the original text, not the lower-cased copy.
                findout.add(result);
            }
        }
    }
}

