2008-04-22
Html解析生成纯文本-使用SAX以及htmlcleaner
package testlucene;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
public class SAXxhtml extends DefaultHandler {
/**
* Logger for this class
*/
private static final Logger logger = Logger.getLogger(SAXxhtml.class);
public StringBuffer sb = new StringBuffer();
public boolean usable = true;
private String sPath = "";
public SAXxhtml() {
super();
// TODO Auto-generated constructor stub
// PropertyConfigurator.configure("log4j.properties");
BasicConfigurator.configure();
}
public void startElement(String namespaceURI, String localName,
String rawName, Attributes atts) throws SAXException {
if (rawName.equals("style") || rawName.equals("script")) {
usable = false;
}
}
// 解析完成后的统计工作
public void endDocument() throws SAXException {
try {
PrintWriter pw = new PrintWriter(new FileOutputStream(sPath));
pw.print(sb.toString());
pw.flush();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void characters(char[] ch, int start, int length) {
String charEncontered = new String(ch, start, length);
/*
* if (!charEncontered.startsWith("<!")||!charEncontered.startsWith("<
* ")) { sb.append("\n"); sb.append(charEncontered); }
*/
if (usable) {
sb.append(charEncontered);
sb.append("\n");
}
usable = true;
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
/*
* SAXParserFactory spf = SAXParserFactory.newInstance(); XMLReader
* xmlReader = null; SAXParser saxParser = null; try { //
* 创建一个解析器SAXParser对象 saxParser = spf.newSAXParser(); //
* 得到SAXParser中封装的SAX XMLReader xmlReader = saxParser.getXMLReader();
* saxParser.parse(new File("d:/sina.xml"), new SAXxhtml()); } catch
* (Exception ex) { logger.error("main(String[]) - " + ex, ex);
* System.exit(1); }
*/
}
@Override
public void endElement(String arg0, String arg1, String arg2)
throws SAXException {
// TODO Auto-generated method stub
super.endElement(arg0, arg1, arg2);
}
public void parse(String sPath, String Scontent) {
this.sPath = sPath;
try {
// System.out.println(Scontent);
HtmlCleaner hc = new HtmlCleaner(Scontent);
hc.clean();
PrintWriter pw = new PrintWriter(new FileOutputStream("e:/tmpfile/tmp.txt"));
pw.print(sb.toString());
pw.flush();
pw.close();
FileInputStream fis = new FileInputStream(new File("e:/tmpfile/tmp.txt"));
String mid = hc.getBrowserCompactXmlAsString();
StringReader sr = new StringReader(mid);
InputSource iSrc = new InputSource(sr);
System.out.println(iSrc.toString());
SAXParserFactory spf = SAXParserFactory.newInstance();
XMLReader xmlReader = null;
SAXParser saxParser = null;
// 创建一个解析器SAXParser对象
saxParser = spf.newSAXParser();
// 得到SAXParser中封装的SAX XMLReader
xmlReader = saxParser.getXMLReader();
saxParser.parse(fis, new SAXxhtml());
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ParserConfigurationException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SAXException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
具体思路是Html->xml,然后就可以用sax对xml解析,但是程序总调不通,有人能帮助解决一下么?
发表评论
提醒: 该博客已发表在公共论坛,博客所有留言会成为论坛回贴,留言请注意遵守论坛发贴规则
- 浏览: 19047 次
- 性别:

- 来自: 北京

- 详细资料
搜索本博客
我的相册
1.jpg
共 23 张
共 23 张
最近加入圈子
最新评论
-
凌晨写博客
终于可以留言了,飘过来看看==||
-- by yj_grace -
NGOSS:下一代网络运营支 ...
精华尽在SID
-- by partech -
NGOSS:下一代网络运营支 ...
电信的东西比较多,这个只能算是介绍
-- by everlasting_188 -
给女孩的话:当他不再爱你 ...
爱和被爱,有得选的么? 如果可以选,我也希望能完全忘记并完全投入“爱你并正派的男 ...
-- by zbobo -
比較Java和C++的總體擁有 ...
delete 写道魔力猫咪 写道不过这种由厂商或者由厂商赞助的调查报告有多少真实 ...
-- by jimmy_c






评论排行榜