HTML Parsers in Java

| No TrackBacks

Reference: HTML Parser 对于网页格式中的文本,提取其内容
I needed to parse an HTML page to get the parameters of a Java applet tag.
HTML Parser is a Java library. It supports the node and tag types below.

// One NodeClassFilter per node/tag class, each built from the class literal it
// targets. Presumably each filter accepts only nodes of that class -- confirm
// against the HTML Parser Javadoc. Only the AppletTag filter is used in the
// complete program; the others need matching imports for their classes.
NodeFilter textFilter = new NodeClassFilter(TextNode.class);
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
NodeFilter appletFilter = new NodeClassFilter(AppletTag.class);
NodeFilter imageFilter = new NodeClassFilter(ImageTag.class);
NodeFilter frameFilter = new NodeClassFilter(FrameTag.class);
NodeFilter scriptFilter = new NodeClassFilter(ScriptTag.class);
NodeFilter formFilter = new NodeClassFilter(FormTag.class);
NodeFilter objectFilter = new NodeClassFilter(ObjectTag.class);
NodeFilter remarkFilter = new NodeClassFilter(RemarkNode.class);
NodeFilter metaFilter = new NodeClassFilter(MetaTag.class);

The complete Java code is:

import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.tags.AppletTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

public class GetTaiFex {

    static List<String> findout = new ArrayList<String>();
    public static void main(String[] args) {
        try {
            ConnectionManager cm = new ConnectionManager();
            Hashtable<String, String> properties = new Hashtable<String, String>();
            properties.put("User-Agent", "IE/6.0");
            //properties.put("Cookie", "ValidLogin=1");
            cm.setRequestProperties(properties);
            Parser.setConnectionManager(cm);
            Parser parser = new Parser(
                    "http://java.sun.com/applets/jdk/1.4/demo/applets/Animator/example1.html");
            NodeFilter appletFilter = new NodeClassFilter(AppletTag.class);
            NodeList list = parser.parse(appletFilter);
            processNodeList(list, "param");
            System.out.println(findout);
        } catch (ParserException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

    }

    private static void processNodeList(NodeList list, String keyword) {
        SimpleNodeIterator iterator = list.elements();
        while (iterator.hasMoreNodes()) {
            Node node = iterator.nextNode();
            NodeList childList = node.getChildren();
            if (null == childList)
                {
                String result = node.getText();
                if (result.indexOf(keyword) != -1){
                    findout.add(result);
                }
                }
            else{
                processNodeList(childList, keyword);
            }
        }
    }
}

No TrackBacks

TrackBack URL: http://server.everfine.com.tw/blog/mt-tb.cgi/258

March 2010

Sun Mon Tue Wed Thu Fri Sat
  1 2 3 4 5 6
7 8 9 10 11 12 13
14 15 16 17 18 19 20
21 22 23 24 25 26 27
28 29 30 31      

Archives

Powered by Movable Type 4.33-en

About this Entry

This page contains a single entry by philipz published on June 30, 2009 5:43 PM.

Find recent content on the main index or look in the archives to find all content.