首页 文章

使用SAX解析器构建XML文件

提问于
浏览
0

我正在解析维基百科的 XML 数据转储,想把每个页面取出来,转换成只包含该页面精简内容的新 XML 文档。例如,在每个页面中,我只对标题(title)、ID、时间戳、用户名和文本感兴趣。

这是一个完整的维基百科页面:

<page>
<title>AccessibleComputing</title>
<ns>0</ns>
<id>10</id>
<redirect title="Computer accessibility" />
<revision>
  <id>381202555</id>
  <timestamp>2010-08-26T22:38:36Z</timestamp>
  <contributor>
    <username>OlEnglish</username>
    <id>7181920</id>
  </contributor>
  <minor />
  <comment>[[Help:Reverting|Reverted]] edits by [[Special:Contributions/76.28.186.133|76.28.186.133]] ([[User talk:76.28.186.133|talk]]) to last version by Gurch</comment>
  <text xml:space="preserve">#REDIRECT [[Computer accessibility]] {{R from CamelCase}}</text>
  <sha1 />
  </revision>
</page>

剥离完成后我想最终得到的结果是这样的:

<page>
  <title>AccessibleComputing</title>
  <id>10</id>
  <revision>
    <timestamp>2010-08-26T22:38:36Z</timestamp>
    <contributor>
      <username>OlEnglish</username>
    </contributor>
    <text xml:space="preserve">#REDIRECT [[Computer accessibility]] {{R from CamelCase}}</text>
  </revision>
</page>

由于这些文件非常大,我知道不能使用 DOM 来处理这个问题。我知道如何设置 SAX 解析器,但在解析文档的同时构建新 XML 文件的最佳方法是什么?

谢谢

2 回答

  • 0

    您可以使用XMLFilterImpl并只保留您需要的内容,这里有一个想法,输入和输出都是流,因此它可以处理任何大小的XML

    // Streaming filter: forwards only the elements wanted in the trimmed output
    // (plus the document root, so the result stays a single well-formed document)
    // and silently drops everything else. Input and output are both streams, so
    // the size of the dump does not matter.
    XMLReader xr = new XMLFilterImpl(XMLReaderFactory.createXMLReader()) {
        // Path suffixes of the elements to keep. Matching on the path rather than
        // on the bare name is what lets <page>/<id> through while dropping the
        // <id> children of <revision> and <contributor>.
        private final String[] kept = {
            "/page",
            "/page/title",
            "/page/id",
            "/page/revision",
            "/page/revision/timestamp",
            "/page/revision/contributor",
            "/page/revision/contributor/username",
            "/page/revision/text"
        };

        // Slash-separated qNames of the currently open elements, e.g. "/mediawiki/page/title".
        private final StringBuilder path = new StringBuilder();

        // True when the innermost open element should appear in the output.
        private boolean isKept() {
            if (path.length() > 0 && path.lastIndexOf("/") == 0) {
                return true; // document root
            }
            String current = path.toString();
            for (String suffix : kept) {
                if (current.endsWith(suffix)) {
                    return true;
                }
            }
            return false;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            path.append('/').append(qName);
            if (isKept()) {
                super.startElement(uri, localName, qName, atts);
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            // Decide before popping: the path must still describe the element being closed.
            if (isKept()) {
                super.endElement(uri, localName, qName);
            }
            path.setLength(Math.max(0, path.lastIndexOf("/")));
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // Text is forwarded only when its parent element is kept. Whitespace that
            // surrounded removed siblings is forwarded too, so the output may contain
            // blank lines where elements were dropped.
            if (isKept()) {
                super.characters(ch, start, length);
            }
        }
    };
    Source src = new SAXSource(xr, new InputSource("1.xml"));
    Result res = new StreamResult(System.out);
    // Identity transform: serializes the filtered SAX event stream as XML.
    TransformerFactory.newInstance().newTransformer().transform(src, res);
    
  • 2

    在这里,我使用 SAX 解析器实现了解析,提取维基百科转储文件中的 title 元素,以及 redirect 元素中的 title 属性。

    package parser;
    
    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.PrintWriter;
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Iterator;
    import java.util.List;
    
    import javax.xml.parsers.ParserConfigurationException;
    import javax.xml.parsers.SAXParser;
    import javax.xml.parsers.SAXParserFactory;
    
    import org.xml.sax.Attributes;
    import org.xml.sax.SAXException;
    import org.xml.sax.helpers.DefaultHandler;
    
    
    
     /**
      * SAX handler that collects the text of every {@code <title>} element and the
      * {@code title} attribute of every {@code <redirect>} element, and writes them
      * out in sorted batches of {@code MAX_SIZE} entries, one numbered text file per batch.
      */
     public class SAXHandler extends DefaultHandler {

         /** File that {@link #parseDoc()} parses. */
         private static final String INPUT_FILE =
                 "D:\\XMLParsing_Files\\enwiki-20120902-pages-articles-multistream.xml";

         /** Output file name up to (not including) the batch number and ".txt". */
         private static final String OUTPUT_PREFIX =
                 "D:\\XMLParsing_Files\\Extracted_Data\\Extracted_Sorted_Data_";

         /** Entries collected for the batch currently being filled. */
         List<String> list = new ArrayList<String>();
         /** {@code count}: entries in the current batch; {@code counter}: last file suffix handed out. */
         int count = 0, counter = 0;
         /** Number of entries that triggers a flush to disk. */
         int MAX_SIZE = 100000;
         /** Text of the most recently completed {@code <title>} element. */
         String temp = "";
         /** Number of batch files written so far (used for the progress message). */
         int counterz = 0;

         /** Accumulates the text of the {@code <title>} element currently open. */
         private final StringBuilder titleText = new StringBuilder();
         /** True between the start and end tag of a {@code <title>} element. */
         private boolean insideTitle;

         public static void main(String[] args)
                 throws ParserConfigurationException, SAXException, IOException {

             long start = System.currentTimeMillis();

             SAXHandler saxhandler = new SAXHandler();
             saxhandler.assign();
             saxhandler.parseDoc();

             long end = System.currentTimeMillis();
             System.out.println("Time taken to write is " + (end - start) + "msecs");
         }

         /** Resets the batch list to a fresh, empty list. */
         void assign() {
             list = new ArrayList<String>();
         }

         /**
          * Parses the input file with this handler, then writes whatever is left in
          * the (partially filled) last batch.
          *
          * @throws SAXException if parsing fails or a full batch cannot be written
          * @throws IOException  if the input cannot be read or the last batch cannot be written
          */
         void parseDoc() throws ParserConfigurationException, SAXException, IOException {
             SAXParserFactory spf = SAXParserFactory.newInstance();
             SAXParser sp = spf.newSAXParser();
             // parse(String, ...) expects a URI; passing a File avoids misreading the Windows path.
             sp.parse(new File(INPUT_FILE), this);
             writeToFile(list); // for writing the end elements
         }

         @Override
         public void startDocument() throws SAXException {
         }

         @Override
         public void endDocument() throws SAXException {
         }

         /**
          * Records the {@code title} attribute of {@code <redirect>} elements and starts
          * text collection when a {@code <title>} element opens.
          */
         @Override
         public void startElement(String uri, String localName, String qName, Attributes attributes)
                 throws SAXException {
             if (qName.equalsIgnoreCase("redirect")) {
                 String target = (attributes == null) ? null : attributes.getValue("title");
                 // A null entry would make Collections.sort throw, so skip redirects without a title.
                 if (target != null) {
                     addEntry(target);
                 }
             } else if (qName.equalsIgnoreCase("title")) {
                 insideTitle = true;
                 titleText.setLength(0);
             }
         }

         /** Records the accumulated text when a {@code <title>} element closes. */
         @Override
         public void endElement(String uri, String localName, String qName) throws SAXException {
             if (qName.equalsIgnoreCase("title")) {
                 insideTitle = false;
                 temp = titleText.toString();
                 addEntry(temp);
             }
         }

         /**
          * Appends character data to the current title. A parser may report one
          * element's text in several chunks, so chunks are accumulated rather than
          * replacing each other; text outside {@code <title>} is ignored.
          */
         @Override
         public void characters(char ch[], int start, int length) throws SAXException {
             if (insideTitle) {
                 titleText.append(ch, start, length);
             }
         }

         /**
          * Adds one entry to the current batch and flushes the batch to disk once it is full.
          *
          * @throws SAXException wrapping the {@link IOException} if the batch cannot be
          *                      written, so the failure stops the parse instead of
          *                      silently discarding the batch
          */
         private void addEntry(String value) throws SAXException {
             list.add(value);
             count++;
             if (count >= MAX_SIZE) {
                 try {
                     writeToFile(list);
                 } catch (IOException e) {
                     throw new SAXException("Failed to write batch file number " + counter, e);
                 }
                 list.clear();
                 count = 0;
             }
         }

         /**
          * Sorts {@code list} in place and writes it, one entry per line and followed by
          * a marker line of z's, to the next numbered output file.
          *
          * @param list entries to write; sorted as a side effect
          * @throws IOException if the file cannot be opened or written
          */
         void writeToFile(List<String> list) throws IOException {
             Collections.sort(list);
             File file = new File(OUTPUT_PREFIX + getSuffix() + ".txt");

             // Explicit UTF-8: titles contain non-ASCII characters that the platform
             // default charset may not be able to represent. Creates or truncates the file.
             PrintWriter pw = new PrintWriter(file, "UTF-8");
             try {
                 for (String entry : list) {
                     pw.println(entry);
                 }
                 pw.println("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz");
                 // PrintWriter never throws on write; checkError() flushes and reports failures.
                 if (pw.checkError()) {
                     throw new IOException("Error while writing " + file);
                 }
             } finally {
                 pw.close();
             }
             System.out.println(++counterz + "Done");
         }

         /** Returns the next output file number, starting at 1. */
         int getSuffix() {
             counter++;
             return counter;
         }

     }
    

相关问题