Package com.findwise.hydra.stage.tika

Source Code of com.findwise.hydra.stage.tika.TikaStageTest

package com.findwise.hydra.stage.tika;

import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.util.Arrays;

import org.junit.Test;

import com.findwise.hydra.DocumentFile;
import com.findwise.hydra.DocumentFileRepository;
import com.findwise.hydra.DocumentID;
import com.findwise.hydra.local.Local;
import com.findwise.hydra.local.LocalDocument;

import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertFalse;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

public class TikaStageTest {
  @Test
  public void testByDefaultExtractsContentMetadataAndLanguage() throws Exception {
    TikaStage stage = new TikaStage();
    LocalDocument doc = buildDocumentWithResourceFile("/test.pdf");
    stage.process(doc);

    assertThat(doc.getContentFieldAsString("test_pdf_Author"), equalTo("Bertrand Delacrétaz"));
    assertThat(doc.getContentFieldAsString("test_pdf_content"), containsString("Tika is a toolkit for detecting and extracting metadata and structured text content"));
    assertThat(doc.getContentFieldAsString("test_pdf_language"), equalTo("en"));
  }

  @Test
  public void testMetadataExtractionCanBeDisabled() throws Exception {
    TikaStage stage = new TikaStage();
    stage.setAddMetaData(false);
    LocalDocument doc = buildDocumentWithResourceFile("/test.pdf");
    stage.process(doc);
    assertFalse("Document should not contain metadata", doc.hasContentField("test_pdf_Author"));
  }

  @Test
  public void testLanguageDetectionCanBeDisabled() throws Exception {
    TikaStage stage = new TikaStage();
    stage.setAddLanguage(false);
    LocalDocument doc = buildDocumentWithResourceFile("/test.pdf");
    stage.process(doc);
    assertFalse("Document should not contain language", doc.hasContentField("test_pdf_language"));
  }

  /**
   * Creates a LocalDocument containing an attached DocumentFile whose contents are taken from
   * the given resource.
   *
   */
  private LocalDocument buildDocumentWithResourceFile(String resourcePath) throws IOException, URISyntaxException {
    LocalDocument doc = new LocalDocument();
    doc.setDocumentFileRepository(buildDocumentFileRepositoryWithResource(resourcePath));
    return doc;
  }

  /**
   *  Returns a mock implementation of a DocumentFileRepository containing a single file.
   *
   *  The contents of the file are read using this.getClass().getResourceAsStream(resourcePath)
   */
  private DocumentFileRepository buildDocumentFileRepositoryWithResource(String resourcePath) throws IOException, URISyntaxException {
    String fileName = resourcePath.replaceFirst(".*/", "");
    InputStream inputStream = this.getClass().getResourceAsStream(resourcePath);
    assertThat("Could not find resource with path: " + resourcePath, inputStream, notNullValue());

    // Mock a DocumentFile that has the proper input stream.
    DocumentFile<Local> file = mock(DocumentFile.class);
    when(file.getStream()).thenReturn(inputStream);

    // Mock the fileRepository and make it return the proper file name and our mocked DocumentFile
    DocumentFileRepository fileRepository = mock(DocumentFileRepository.class);
    when(fileRepository.getFileNames(any(DocumentID.class))).thenReturn(Arrays.asList(fileName));
    when(fileRepository.getFile(anyString(), any(DocumentID.class))).thenReturn(file);

    return fileRepository;
  }
}
TOP

Related Classes of com.findwise.hydra.stage.tika.TikaStageTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.