Examples of com.digitalpebble.behemoth.BehemothDocument

com.digitalpebble.behemoth.BehemothDocument
Implementation of a Document using Hadoop primitives. A BehemothDocument consists of a URL, a content type, binary content, metadata, and Annotations.

    public long generate(boolean recurse) throws IOException {
        long result = 0;
        // read from input path
        // create new Content object and add it to the SequenceFile
        Text key = new Text();
        BehemothDocument value = new BehemothDocument();
        SequenceFile.Writer writer = null;
        try {
            Configuration conf = getConf();
            FileSystem fs = output.getFileSystem(conf);
            writer = SequenceFile.createWriter(fs, conf, output,
                    key.getClass(), value.getClass());
            PerformanceFileFilter pff = new PerformanceFileFilter(writer, key,
                    value, conf, reporter);
            // iterate on the files in the source dir
            result = processFiles(conf, input, recurse, pff);

View Full Code Here


    public void testTextExtractionTika() {
        // Create a very simple Behemoth document
        String text = "<HTML><TITLE>A TITLE</TITLE><BODY>This is a <B>simple</B> test</HTML>";
        String url = "dummyDoc.html";
        BehemothDocument doc = new BehemothDocument();
        doc.setContent(text.getBytes());
        doc.setUrl(url);
        // don't set the text as such
        // or the content type
        BehemothDocument[] outputs = tika.process(doc, null);
        // the output should contain only one document
        assertEquals(1, outputs.length);
        BehemothDocument output = outputs[0];
        // the output document must be marked as text/html
        assertEquals("text/html", output.getContentType());
        // and have the following text
        String outText = output.getText().trim().replaceAll("\\n+", "\n");
        assertEquals("A TITLE\nThis is a simple test", outText);
    }

View Full Code Here

    }


    private String testLanguage(String text) {
        // Create a very simple Behemoth document
        String url = "dummyDoc.html";
        BehemothDocument doc = new BehemothDocument();
        doc.setContent(text.getBytes());
        doc.setText(text);
        doc.setUrl(url);
        // don't set the text as such
        // or the content type
        BehemothDocument[] outputs = langid.process(doc, null);
        // the output should contain only one document
        assertEquals(1, outputs.length);
        BehemothDocument output = outputs[0];
        // the output document should have a language metadata
        // and its value should be french
        Writable lang = output.getMetadata().get(
                LanguageIdProcessor.languageMDKey);
        return lang.toString();
    }

View Full Code Here


  public void map(Text key, Content content,
      OutputCollector<Text, BehemothDocument> output, Reporter reporter)
      throws IOException {


    BehemothDocument behemothDocument = new BehemothDocument();


    int status = Integer.parseInt(content.getMetadata().get(
        Nutch.FETCH_STATUS_KEY));
    if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
      // content not fetched successfully, skip document
      LOG.debug("Skipping " + key
          + " as content is not fetched successfully");
      return;
    }


    // TODO store the fetch metadata in the Behemoth document
    // store the binary content and mimetype in the Behemoth document


    String contentType = content.getContentType();
    byte[] binarycontent = content.getContent();
    behemothDocument.setUrl(key.toString());
    behemothDocument.setContent(binarycontent);
    behemothDocument.setContentType(contentType);
    output.collect(key, behemothDocument);
  }

View Full Code Here


    public void testTokenizationANNIE() {
        // Create a very simple Behemoth document
        String text = "This is a simple test";
        String url = "dummyURL";
        BehemothDocument doc = new BehemothDocument();
        doc.setContent(text.getBytes());
        doc.setUrl(url);
        doc.setContentType("text/plain");
        // don't set the text as such
        // or any metadata at all
        BehemothDocument[] outputs = gate.process(doc, null);
        // the output should contain only one document
        assertEquals(1, outputs.length);
        BehemothDocument output = outputs[0];
        // the output document must have 5 annotations of type token
        // see gate.annotations.filter in Configuration above
        assertEquals(5, output.getAnnotations().size());
    }

View Full Code Here


    public void testTokenizationANNIE2() {
        // Create a very simple Behemoth document
        String text = "This is a simple test";
        String url = "dummyURL";
        BehemothDocument doc = new BehemothDocument();
        doc.setText(text);
        doc.setUrl(url);
        doc.setContentType("text/plain");
        // don't set the text as such
        // or any metadata at all
        BehemothDocument[] outputs = gate.process(doc, null);
        // the output should contain only one document
        assertEquals(1, outputs.length);
        BehemothDocument output = outputs[0];
        // the output document must have 5 annotations of type token
        // see gate.annotations.filter in Configuration above
        assertEquals(5, output.getAnnotations().size());
    }

View Full Code Here


    public void testTokenizationANNIE3() {
        // Create a very simple Behemoth document
        String content = "<H1>This is a simple test</H1>";
        String url = "dummyURL";
        BehemothDocument doc = new BehemothDocument();
        doc.setContent(content.getBytes());
        doc.setUrl(url);
        doc.setContentType("text/html");
        // don't set the text as such
        // or any metadata at all
        BehemothDocument[] outputs = gate.process(doc, null);
        // the output should contain only one document
        assertEquals(1, outputs.length);
        BehemothDocument output = outputs[0];
        // the output document must have 5 annotations of type token
        // see gate.annotations.filter in Configuration above
        assertEquals(5, output.getAnnotations().size());


        assertEquals(content.length() - 8, output.getText().length());
        // TODO test that there is initial markup as well
    }

View Full Code Here


    public void testTokenizationUIMA() {
        // Create a very simple Behemoth document
        String text = "This is a simple test";
        String url = "dummyURL";
        BehemothDocument doc = new BehemothDocument();
        doc.setContent(text.getBytes());
        doc.setText(text);
        doc.setUrl(url);
        doc.setContentType("text/plain");
        // don't set the text as such
        // or any metadata at all
        BehemothDocument[] outputs = uima.process(doc, null);
        // the output should contain only one document
        assertEquals(1, outputs.length);
        BehemothDocument output = outputs[0];
        // the output document must have 5 annotations of type token
        // see gate.annotations.filter in Configuration above
        assertEquals(5, output.getAnnotations().size());
    }

View Full Code Here

      response = new HttpResponse(binarycontent);
    } catch (ProtocolException e) {
      return;
    }


    BehemothDocument behemothDocument = new BehemothDocument();


    behemothDocument.setUrl(uri);
    newKey.set(uri);


    String contentType = response.getHeader(HttpHeaders.CONTENT_TYPE);
    behemothDocument.setContentType(contentType);
    behemothDocument.setContent(response.getContent());


    MapWritable md = behemothDocument.getMetadata(true);


    // add the metadata
    for (String mdkey : response.getHeaders().names()) {
      String value = response.getHeaders().get(mdkey);
      md.put(new Text(mdkey), new Text(value));

View Full Code Here

0 1

TOP

Related Classes of com.digitalpebble.behemoth.BehemothDocument

com.digitalpebble.behemoth.gate.GATECorpusGenerator

com.digitalpebble.behemoth.gate.GATEProcessorTest

com.digitalpebble.behemoth.io.nutch.NutchSegmentConverterJob

com.digitalpebble.behemoth.io.sequencefile.SequenceFileConverterMapper

com.digitalpebble.behemoth.io.warc.WARCConverterJob

com.digitalpebble.behemoth.languageidentification.LanguageIDProcessorTest

com.digitalpebble.behemoth.tika.TikaProcessorTest

com.digitalpebble.behemoth.uima.UIMAProcessorTest

com.digitalpebble.behemoth.util.ContentExtractor

com.digitalpebble.behemoth.util.CorpusGenerator

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.