Archive for March 21st, 2010

21
Mar
10

Chop up XML into Many Files based on Tag

This page describes the process of chopping up an xml/xhtml file into many parts using SAX.

Background

Often one is tasked with breaking up huge XML documents into smaller parts for easier processing. You could manually open a BufferedReader and read the file line by line to break it up; however, this is tedious. A better way is to use SAX to parse the document.

In the example below, the input file is piped into the Java program on standard input. This is just a quick and dirty example; it can be used as a basis for your own implementation.

Input Data

The following is the input document.


<!-- Want to chop this into invoice1.inc.html invoice2.inc.html -->


	<table border="1">
		<tbody><tr>
			<td>cell 1</td>
			<td>cell 2</td>
		</tr>
	</tbody></table>
	<table>
		<tbody><tr>
			<td>cell 3</td>
			<td>cell 4</td>
		</tr>
	</tbody></table>

Implementation

package com.test;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that reads an HTML/XHTML document and writes one output file
 * per {@code table} element it contains.
 *
 * Output files are named {@code invoice<N>.inc.html} (N starts at 1) and are
 * created in the base directory given on the command line. Only the
 * {@code table}, {@code tr} and {@code td} elements and the text inside
 * cells are copied; attributes and all other elements are dropped.
 * Nested tables are not supported: a table that starts while another one is
 * still open ends the outer table's file.
 *
 * This class is not thread safe.
 *
 * To run from the command line type:
 * cat htmltest_clean.xml | java com.test.ChopTables [output directory]
 *
 * @author verma
 *
 */
public class ChopTables extends DefaultHandler {
	/** Writer for the table currently being copied; null while outside any table. */
	private PrintWriter pw;
	/** Number of output files started so far; also used to number the files. */
	private int fileCount = 0;
	/** True between the start and end of a {@code td} element. */
	private boolean insideCell;
	/** Directory the output files are created in; empty means the working directory. */
	private static String baseDir = "";

	public ChopTables() {
	}

	/**
	 * Parses the document on standard input and reports the outcome: the
	 * number of files written on stdout, or a failure notice on stderr.
	 */
	public void runExample() {
		if (parseDocument(System.in)) {
			System.out.println("Successfully wrote " + fileCount + " files.");
		} else {
			System.err.println("Parsing failed; " + fileCount + " file(s) were started.");
		}
	}

	/**
	 * Runs a SAX parse of the stream with this object as the handler.
	 *
	 * @param is the document to split
	 * @return true if the whole document was parsed, false if parsing failed
	 *         (the stack trace is printed to stderr)
	 */
	private boolean parseDocument(InputStream is) {
		SAXParserFactory spf = SAXParserFactory.newInstance();
		try {
			SAXParser sp = spf.newSAXParser();
			// this class is registered for the call backs
			sp.parse(is, this);
			return true;
		} catch (SAXException se) {
			se.printStackTrace();
		} catch (ParserConfigurationException pce) {
			pce.printStackTrace();
		} catch (IOException ie) {
			ie.printStackTrace();
		} finally {
			// a parse that aborts inside a table must not leave the file open
			closeCurrentFile();
		}
		return false;
	}

	/** Closes the writer of the table in progress, if any, and resets cell state. */
	private void closeCurrentFile() {
		if (pw != null) {
			pw.close();
			pw = null;
		}
		insideCell = false;
	}

	/**
	 * Re-escapes the XML special characters the parser has already decoded,
	 * so the text can be written back out as markup.
	 */
	private static String escape(String text) {
		StringBuilder out = new StringBuilder(text.length() + 16);
		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);
			switch (c) {
			case '&':
				out.append("&amp;");
				break;
			case '<':
				out.append("&lt;");
				break;
			case '>':
				out.append("&gt;");
				break;
			default:
				out.append(c);
			}
		}
		return out.toString();
	}

	//Event Handlers

	/**
	 * Opens a new output file on {@code table} and echoes the opening tags of
	 * {@code tr} and {@code td}. Rows and cells outside any table are ignored.
	 *
	 * @throws RuntimeException if the output file cannot be created
	 */
	@Override
	public void startElement(String uri, String localName, String qName,
			Attributes attributes) throws SAXException {
		if (qName.equalsIgnoreCase("table")) {
			// an unfinished (outer) table would otherwise leak its writer
			closeCurrentFile();

			String name = "invoice" + (++fileCount) + ".inc.html";
			File target = (baseDir != null && baseDir.length() > 0)
					? new File(baseDir, name)
					: new File(name);
			try {
				pw = new PrintWriter(target, "8859_1");
			} catch (IOException e) {
				throw new RuntimeException("Cannot create output file " + target, e);
			}
			pw.write("<table>");
		} else if (pw == null) {
			// not inside a table: nothing to write to
			return;
		} else if (qName.equalsIgnoreCase("tr")) {
			// each row is wrapped in its own tbody; closed again in endElement
			pw.write("<tbody><tr>");
		} else if (qName.equalsIgnoreCase("td")) {
			insideCell = true;
			pw.write("<td>");
		}
	}

	/**
	 * Copies cell text to the current output file. Each chunk is also echoed
	 * to stdout; chunks consisting only of whitespace are not written.
	 */
	@Override
	public void characters(char[] ch, int start, int length) throws SAXException {
		if (!insideCell || pw == null) {
			return;
		}
		String strToWrite = new String(ch, start, length);
		System.out.println(strToWrite);
		if (!"".equals(strToWrite.trim())) {
			pw.write(escape(strToWrite));
		}
	}

	/**
	 * Echoes the closing tags matching those written by startElement and
	 * closes the output file when the table ends.
	 */
	@Override
	public void endElement(String uri, String localName, String qName)
			throws SAXException {
		if (pw == null) {
			// closing tag outside any table
			return;
		}
		if (qName.equalsIgnoreCase("table")) {
			pw.write("</table>");
			closeCurrentFile();
		} else if (qName.equalsIgnoreCase("tr")) {
			pw.write("</tr></tbody>");
		} else if (qName.equalsIgnoreCase("td")) {
			insideCell = false;
			pw.write("</td>");
		}
	}

	/**
	 * Command line entry point.
	 *
	 * @param args args[0] is the directory the output files are written to
	 * @throws IllegalArgumentException if no output directory is given
	 */
	public static void main(String[] args){
		if(args.length == 0 || args[0]==null || "".equals(args[0].trim())) {
			throw new IllegalArgumentException("must specifiy base dir to expand files to. ");
		}
		baseDir = args[0];
		ChopTables spe = new ChopTables();
		spe.runExample();
	}

}

Test the Implementation

cat htmltest_clean.xml | java com.test.ChopTables [output directory]

That’s all for now!




Follow

Get every new post delivered to your Inbox.

Join 49 other followers