Cascading: XML to Tuple

Posted by OrganicVeggie on December 19, 2008 at 11:38 CST

As part of my goal of using Hadoop to process XML, I also wanted to use Cascading to make my life easier. So I wrote an Operation class that effectively converts an XML file to a list of Tuples.

/**
 * Emits one Tuple for each node in the argument document that matches {@code //parentNodeName}. Each field of
 * the Tuple is the text content of the first element found under that node (via
 * {@link Element#getElementsByTagName(String)}, so any descendant in document order, not only direct children)
 * whose tag name equals the corresponding child path. A missing element yields an empty string.
 *
 * <p>For example, if we have the document:
 *
 * <pre>{@code
 * <root>
 *   <parent>
 *     <a>foo</a>
 *     <b>bar</b>
 *   </parent>
 *   <parent>
 *     <a>apples</a>
 *     <b>bananas</b>
 *   </parent>
 * </root>
 * }</pre>
 *
 * XPathExtractor can pull out two Tuples of the form ["foo", "bar"], ["apples", "bananas"] by creating an instance
 * via:
 *
 * <pre>{@code
 *   new XPathExtractor(new Fields("a", "b"), "parent", "a", "b")
 * }</pre>
 *
 * <p>The operation reads the document from its first argument, which must be a String holding the XML text.
 */
@SuppressWarnings("unchecked")
public class XPathExtractor extends BaseOperation implements Function
{
    private static final long serialVersionUID = -4085913360353750803L;

    // Both are transient and therefore not part of the serialized form; they are created lazily on first use
    // by getXPath() and getExpression().
    private transient XPath xPath;
    private transient XPathExpression expression;

    private final String parentXpath;
    private final String[] childPaths;

    /**
     * Creates an extractor whose output fields and child tag names are given separately.
     *
     * @param fields         the declared output fields; {@code fields.get(i)} receives the value for {@code paths[i]}
     * @param parentNodeName name appended to {@code //} to build the XPath that selects the parent nodes
     * @param paths          tag names looked up under each parent node, one per field
     * @throws IllegalArgumentException if the number of fields differs from the number of paths
     */
    public XPathExtractor(Fields fields, String parentNodeName, String... paths)
    {
        super(1, fields);

        if (fields.size() != paths.length)
        {
            throw new IllegalArgumentException("Number of fields (" + fields.size() + ") and number of child paths (" + paths.length + ") must be equal.");
        }

        this.parentXpath = "//" + parentNodeName;
        // Defensive copy: the caller may reuse or modify the varargs array after construction.
        this.childPaths = paths.clone();
    }

    /**
     * Creates an extractor whose output field names are the child tag names themselves.
     *
     * @param parentNodeName name appended to {@code //} to build the XPath that selects the parent nodes
     * @param paths          tag names looked up under each parent node; also used as the field names
     */
    public XPathExtractor(String parentNodeName, String... paths)
    {
        super(1, new Fields(paths));
        this.parentXpath = "//" + parentNodeName;
        // Defensive copy: the caller may reuse or modify the varargs array after construction.
        this.childPaths = paths.clone();
    }

    /**
     * Creates an extractor whose child tag names are the string forms of the given field names.
     *
     * @param fields         the declared output fields; each field's {@code toString()} is used as a tag name
     * @param parentNodeName name appended to {@code //} to build the XPath that selects the parent nodes
     */
    public XPathExtractor(Fields fields, String parentNodeName)
    {
        super(1, fields);
        this.parentXpath = "//" + parentNodeName;

        childPaths = new String[fields.size()];
        for (int i = 0; i < fields.size(); i++)
        {
            childPaths[i] = fields.get(i).toString();
        }
    }

    /**
     * Returns the compiled parent-node expression, compiling it on first use.
     *
     * @return the compiled form of the parent XPath
     * @throws OperationException if the parent XPath cannot be compiled
     */
    protected XPathExpression getExpression()
    {
        if (expression != null)
        {
            return expression;
        }

        try
        {
            expression = getXPath().compile(parentXpath);
        }
        catch (XPathExpressionException e)
        {
            throw new OperationException("Could not compile xpath expression", e);
        }

        return expression;
    }

    /**
     * Returns the XPath instance used by this XPathExtractor, creating it on first use.
     *
     * @return the XPath (type XPath) of this XPathExtractor object.
     */
    public XPath getXPath()
    {
        if (xPath != null)
        {
            return xPath;
        }

        XPathFactory factory = XPathFactory.newInstance();

        xPath = factory.newXPath();
        return xPath;
    }

    /**
     * Parses the first argument as XML, evaluates the parent XPath against it and adds one Tuple per matched
     * node to the output collector.
     *
     * @throws OperationException if the parent XPath cannot be evaluated against the argument
     * @see cascading.operation.Function#operate(cascading.flow.FlowProcess, cascading.operation.FunctionCall)
     */
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall functionCall)
    {
        // NOTE(review): the XML is parsed by whatever parser the XPath implementation uses by default. If the
        // documents can come from untrusted sources, confirm how DTDs and external entities are handled.
        InputSource source = new InputSource(new StringReader((String) functionCall.getArguments().get(0)));

        XPathExpression parentPath = getExpression();
        try
        {
            NodeList parents = (NodeList) parentPath.evaluate(source, XPathConstants.NODESET);

            for (int i = 0; i < parents.getLength(); i++)
            {
                Node parentNode = parents.item(i);
                functionCall.getOutputCollector().add(extractTuple((Element) parentNode));
            }
        }
        catch (XPathExpressionException exception)
        {
            throw new OperationException("could not evaluate xpath expression: " + parentXpath, exception);
        }
    }

    /**
     * Builds the Tuple for a single parent element: one value per configured child path, in order.
     */
    private Tuple extractTuple(Element parentElem)
    {
        Tuple tuple = new Tuple();
        for (String childName : childPaths)
        {
            NodeList childNodes = parentElem.getElementsByTagName(childName);
            if ((childNodes != null) && (childNodes.getLength() > 0))
            {
                // Only the first match is used; further elements with the same tag name are ignored.
                Element childElem = (Element) childNodes.item(0);
                tuple.add(childElem.getTextContent());
            }
            else
            {
                // Keep the Tuple aligned with the declared fields when the element is absent.
                tuple.add("");
            }
        }
        return tuple;
    }
}


Posted in General | Tags java cascading hadoop xml

Hadoop: Files as Documents

Posted by OrganicVeggie on December 19, 2008 at 10:56 CST

I needed a way to load XML documents into Hadoop and I wanted each document to be treated as a single record. So I wrote a new FileInputFormat and a new RecordReader.

/**
 * An InputFormat for text files in which every file is handed to the job as a single record. Records are
 * produced by {@link FileDocRecordReader}; because {@link #isSplitable} always answers {@code false}, a file is
 * never divided into several splits.
 */
public class FileDocInputFormat extends FileInputFormat<Text, Text>
{
    /**
     * Always refuses splitting so that one file maps to one split.
     *
     * @return {@code false} for every file
     */
    @Override
    protected boolean isSplitable(FileSystem fs, Path filename)
    {
        return false;
    }

    /**
     * Reports the split as the current status and returns a reader for it.
     *
     * @param split    the split to read; it is cast to {@link FileSplit}
     * @param job      the job configuration passed on to the reader
     * @param reporter receives the split's string form as status
     * @return a new {@link FileDocRecordReader} for the split
     * @throws IOException if the reader cannot be created
     */
    @Override
    public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException
    {
        final String status = split.toString();
        reporter.setStatus(status);

        final FileSplit fileSplit = (FileSplit) split;
        return new FileDocRecordReader(job, fileSplit);
    }
}
/**
 * Produces exactly one record per split: the key is the file name (the last path component, as returned by
 * {@code Path.getName()}) and the value is the entire contents of the file.
 */
public class FileDocRecordReader implements RecordReader<Text, Text>
{
    private static final int READ_BLOCK_SIZE = 32768;

    // Null once the file has been consumed or the reader has been closed.
    private InputStream in;
    private final String pathname;
    // Number of bytes copied into the value so far; reported by getPos().
    private long bytesConsumed;

    /**
     * Opens the file behind the given split.
     *
     * @param job   configuration used to resolve the file system of the split's path
     * @param split the split whose file is read as a single record
     * @throws IOException if the file system cannot be obtained or the file cannot be opened
     */
    public FileDocRecordReader(Configuration job, FileSplit split)
        throws IOException
    {
        final Path file = split.getPath();
        pathname = file.getName();
        FileSystem fs = file.getFileSystem(job);
        in = fs.open(file);
    }

    /**
     * Closes the underlying stream if it is still open. Calling it again is a no-op.
     */
    @Override
    public synchronized void close() throws IOException
    {
        if (in != null)
        {
            try
            {
                in.close();
            }
            finally
            {
                // Drop the reference even if close() fails so that next() reports end of input.
                in = null;
            }
        }
    }

    @Override
    public Text createKey()
    {
        return new Text();
    }

    @Override
    public Text createValue()
    {
        return new Text();
    }

    /**
     * @return the number of bytes read from the file so far
     */
    @Override
    public synchronized long getPos() throws IOException
    {
        return bytesConsumed;
    }

    /**
     * @return {@code 0.0f} while the single record is still pending, {@code 1.0f} once the stream has been
     *         consumed or closed
     */
    @Override
    public synchronized float getProgress() throws IOException
    {
        return (in == null) ? 1.0f : 0.0f;
    }

    /**
     * Reads the whole file into {@code value} and sets {@code key} to the file name. The stream is closed
     * afterwards, so every later call returns {@code false}.
     *
     * @param key   receives the file name
     * @param value receives the file contents; any previous contents are discarded
     * @return {@code true} for the first call, {@code false} once the file has been consumed
     * @throws IOException if reading or closing the stream fails
     */
    @Override
    public synchronized boolean next(Text key, Text value) throws IOException
    {
        if (in == null)
        {
            return false;
        }

        key.set(pathname);
        // The caller may hand in a Text that already holds data; append() alone would keep those bytes.
        value.clear();

        final byte[] data = new byte[READ_BLOCK_SIZE];
        try
        {
            int bytesRead = in.read(data);
            while (bytesRead > 0)
            {
                value.append(data, 0, bytesRead);
                bytesConsumed += bytesRead;
                bytesRead = in.read(data);
            }
        }
        finally
        {
            // Release the stream even when a read fails part-way through.
            close();
        }

        return true;
    }
}

 

Posted in Java | Tags java hadoop

Virtual Hosts with Apache 2.2 and Tomcat 6

Posted by OrganicVeggie on October 31, 2008 at 16:23 CDT

I was trying to get two virtual hosts with completely separate domain names working correctly. I have Apache 2.2 as a front-end and Tomcat 6 as a back-end server. I finally followed the path outlined on the Confluence wiki:

http://confluence.atlassian.com/display/DISC/Using+Virtual+Hosts+on+both+Apache+and+Tomcat

1) Setup the virtual host(s) in Apache:

<VirtualHost *:80>
  ServerName  www.bealetech.com
  DocumentRoot /var/www/bealetech.com/

  RewriteEngine On

  # Check for maintenance file and redirect all requests
  RewriteCond /var/www/maintenance.html -f
  RewriteCond %{SCRIPT_FILENAME} !maintenance.html
  RewriteRule ^.*$ /var/www/maintenance.html [L]

  ProxyRequests off
  ProxyPass / http://www.bealetech.com:8080/
  ProxyPassReverse / http://www.bealetech.com:8080/

  <Proxy *>
    Order deny,allow
    Allow from all
  </Proxy>
</VirtualHost>

Note the use of the full hostname in the ProxyPass and ProxyPassReverse entries. Without the FQDN, Tomcat will not be able to resolve its virtual host settings correctly.

2) Setup separate virtual hosts in Tomcat:
$TOMCAT/conf/server.xml

<Host xmlNamespaceAware="false" xmlValidation="false" autoDeploy="true" unpackWARs="true" appBase="vhosts/bealetech.com" name="www.bealetech.com">
    <Context debug="0">
  </Context>
</Host>

Note the use of a separate folder for the blog's appBase. Inside the vhosts directory you will find one folder for each virtual host. And inside each of those folders, you will find a single ROOT webapp.

Posted in General | Tags setup apache tomcat

New Blog

Posted by OrganicVeggie on October 31, 2008 at 16:15 CDT

I finally got tired of memory problems with running Typo 5.1 under Mongrel. I never figured out if it was a problem with Typo, Mongrel or my own lack of knowledge regarding the two. I also tried to get Typo 5.1.3 running with JRuby and Tomcat, but that was a nightmare as well.

In the end, I gave up and switched to Apache Roller. It does lack a few of the fancy bells and whistles, but it seems to work well. And running Roller under Tomcat seems to integrate well with my existing infrastructure.

Posted in General | Tags java apache blog

Deploy JRuby on Rails to Tomcat with Capistrano

Posted by OrganicVeggie on October 3, 2008 at 22:37 CDT

I’ve been slowly migrating to JRuby on Rails under Tomcat, since Mongrel has been consistently unstable for me. Unfortunately, for reasons not worth explaining at this time, I couldn’t actually build the WAR on the server. As a Java developer, the traditional approach would be to build the WAR on my machine, use scp to copy it to the server, then ssh into the box, move the WAR and restart Tomcat.

Whew!

Of course, what happens if I have database changes to include? Hrmmm.

I’m also lazy. And I don’t like to repeat myself. So I put together a very basic Capistrano deployment script that I use for some of my JRoR deployments:

# Application name; interpolated into :deploy_to below.
set :application, "Example"

# Source control account; interpolated into the repository URL and the
# password prompt by the :prompt_update task.
set :scm_user, "svnuser"

# Repository location without scheme or credentials. The :prompt_update task
# asks for the SCM password and assembles the full https URL from this value.
set :scm_repository,  "svn.example.com/svn/repo/trunk/Example"

# Name of the web application and of the WAR file derived from it.
set :webapp,  "example"
set :war,     "#{webapp}.war"
# Directory the WAR is copied into (as ROOT.war) by the :cp_war task.
set :tomcat_webapps,  "/opt/tomcat/webapps"

# Remote directory the project is deployed to.
set :deploy_to, "/opt/webapps/#{application}"

# A single host carries the app, web and db roles.
server "example.com", :app, :web, :db, :primary => true

# NOTE(review): presumably the account used to log in to the server -- confirm.
set :user, "myuser"

# Uploads the locally built WAR into the deploy directory on the app servers
# and then copies it into Tomcat's webapps directory as ROOT.war.
task :cp_war, :roles => :app do
  # A WAR is a binary archive, so read it in binary mode.
  war_data = File.open(war, "rb") { |f| f.read }
  # The remote path must name the file itself: the cp below reads
  # #{deploy_to}/#{war}, and #{deploy_to} on its own is a directory.
  put(war_data, "#{deploy_to}/#{war}")
  run "#{try_sudo} cp -f #{deploy_to}/#{war} #{tomcat_webapps}/ROOT.war"
end

# Task description for :prompt_update (attached to the task defined next).
desc <<-DESC
  Updates the project on the remote server and updates the symlink.
  Prompts for the source control password.
DESC
task :prompt_update do
  # The password is supplied by a block rather than a literal, so the prompt
  # appears only when :scm_password is actually read.
  # NOTE(review): relies on Capistrano evaluating block-valued settings lazily -- confirm.
  set(:scm_password) do
    Capistrano::CLI.password_prompt "Password for SCM user #{scm_user}: "
  end
  # Full repository URL with the credentials embedded; reading scm_password
  # here is what triggers the prompt above.
  set :repository,  "https://#{scm_user}:#{scm_password}@#{scm_repository}"

  # Run the update task of the deploy namespace.
  deploy.update
end

# Defines the default and restart tasks of the deploy namespace for a
# Tomcat-hosted WAR.
namespace :deploy do
  # Full deployment, in order: update the checkout (prompting for the SCM
  # password), run migrations, copy the WAR into Tomcat, restart Tomcat.
  # NOTE(review): migrate is not defined in this script; presumably it is
  # Capistrano's stock deploy:migrate task -- confirm.
  task :default do
    prompt_update
    migrate
    cp_war
    restart
  end

  # Stops Tomcat, deletes the ROOT directory under the webapps directory and
  # starts Tomcat again. The commands are chained with &&, so each step runs
  # only if the previous one succeeded.
  # NOTE(review): the ROOT directory is presumably the unpacked copy of the
  # previous ROOT.war, removed so the new WAR gets unpacked -- confirm.
  task :restart do
    run "#{try_sudo} /etc/init.d/tomcat stop && #{try_sudo} rm -fR #{tomcat_webapps}/ROOT && #{try_sudo} /etc/init.d/tomcat start"
  end
end

Posted in Rails | Tags deployment capistrano ruby rails jruby tomcat script

Search


Calendar

« July 2009
SunMonTueWedThuFriSat
   
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 
       
Today

Links

Tag Cloud

ajax apache article beer beryl brewing capistrano cigarphiles cigars compiz deploy deployment email hadoop java jruby kölsch mongrel rails refactoring ruby script setup spring struts tomcat tutorial ubuntu webdev xorg

Feeds

Referers

Navigation

Games

Photos

Ads