Package edu.ucla.sspace.text.corpora

Source Code of edu.ucla.sspace.text.corpora.SemEvalLexSubReader$SemEvalHandler

package edu.ucla.sspace.text.corpora;

import edu.ucla.sspace.text.CorpusReader;
import edu.ucla.sspace.text.Document;
import edu.ucla.sspace.text.StringDocument;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.Reader;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.helpers.DefaultHandler;

* Reads the xml corpus files for the SemEval 2010 Lexical Substition task,
* available <a
* href=">here</a>.  Each
* file contains all of the contexts for a single word.  The xml files should be
* unchanged from their original format. 
* </p>
* This {@link CorpusReader} returns documents in the following format:
* </br>
* word_instance_id text ... ||| *focus_word* text ...
* </p>
* Note that this is implemented as a {@link DefaultHandler} for a {@link
* SAXParser}.
* @author Keith Stevens
public class SemEvalLexSubReader extends DefaultHandler
                                 implements CorpusReader<Document> {

     * {@inheritDoc}
    public Iterator<Document> read(Reader reader) {
        SAXParserFactory saxfac = SAXParserFactory.newInstance();

        // Ignore a number of xml features, like dtd grammars and validation.
        try {
            saxfac.setFeature("", false);
            saxfac.setFeature("", false);
            saxfac.setFeature("", false);
            saxfac.setFeature("", false);
            saxfac.setFeature("", false);
            SAXParser saxParser = saxfac.newSAXParser();
            SemEvalHandler handler = new SemEvalHandler();
            saxParser.parse(new InputSource(reader), handler);
            return handler.contexts.iterator();
        } catch (IOException ioe) {
            throw new IOError(ioe);
        } catch (SAXNotRecognizedException e1) {
            throw new RuntimeException(e1);
        }catch (SAXNotSupportedException e1) {
            throw new RuntimeException(e1);
        } catch (ParserConfigurationException e1) {
            throw new RuntimeException(e1);
        } catch (SAXException e) {
            throw new RuntimeException(e);

     * {@inheritDoc}
    public Iterator<Document> read(File file) {
        try {
            return read(new FileReader(file));
        } catch (FileNotFoundException fnfe) {
            throw new IOError(fnfe);

    public class SemEvalHandler extends DefaultHandler {

         * The list of processed contexts.
        List<Document> contexts;

         * A boolean marker signifying that the parser is in the context tag.
        private boolean inContext;

         * A boolean marker signifying that the parser is in the head tag.
        private boolean inHead;

         * A boolena marker signifying that the parser is in the lexelt tag.
        private boolean inLexElement;

         * The current instance id.
        private String instanceId;

         * The current instance lexical id
        private String lexicalId;

         * A {@link StringBuilder} for forming the current context.
        private StringBuilder context;

        public SemEvalHandler() {
            contexts = new ArrayList<Document>();
            inContext = false;
            inHead = false;
            inLexElement = false;
            context = new StringBuilder();

         * Extracts the instance id for the current context.
        public void startElement(String uri, String localName,
                                 String name, Attributes atts)
                throws SAXException {
            if (name.equals("lexelt")) {
                inLexElement = true;
                lexicalId = atts.getValue("item");
            } else if (name.equals("instance")) {
                instanceId = atts.getValue("id") + " ";
            } else if (name.equals("context"))
                inContext = true;
            else if (name.equals("head"))
                inHead = true;

         * Ends the current context and stores all of the information in {@code
         * contexts}.
        public void endElement(String uri, String localName, String name)
                throws SAXException {
            if (name.equals("head"))
                inHead = false;
            else if (name.equals("lexelt"))
                inLexElement = false;
            else if (name.equals("context")) {
                inContext = false;
                contexts.add(new StringDocument(context.toString()));

         * Appends the text to the current context.
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            if (inContext)
                context.append(new String(ch, start, length));
            if (inHead)
                context.append(" |||| ").append(new String(ch,start,length));

Related Classes of edu.ucla.sspace.text.corpora.SemEvalLexSubReader$SemEvalHandler

Copyright © 2018 All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact