Package org.apache.any23.extractor.html.HTMLDocument

Examples of org.apache.any23.extractor.html.HTMLDocument.TextField


                configuration,
                new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
                extractorGroup,
                cth
        );
        instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
        return instance;
    }
View Full Code Here


    private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
        String fromRequest = getFormatFromRequest(request);
        if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
            return fromRequest;
        }
        MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
        if (result == null) {
            return null;
        }
        else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) {
            return "turtle";
        }
        else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
            return "n3";
        }
        else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
            return "nq";
        }
        else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
            return "rdf";
        }
        else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
            return "nt";
        }
        else {
            return "turtle";    // shouldn't happen
        }
View Full Code Here

            );
            return;
        }
        log("Attempting conversion to '" + format + "' from POST body");
        responder.runExtraction(
                new ByteArrayDocumentSource(
                        req.getInputStream(),
                        Servlet.DEFAULT_BASE_URI,
                        getContentTypeHeader(req)
                ),
                eps,
View Full Code Here

    private File getFile() {
        return file;
    }

    public DocumentSource getOpener(String baseURI) {
        return new FileDocumentSource(getFile(), baseURI);
    }
View Full Code Here

        }
    }

    protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
            throws IOException, URISyntaxException {
        return new HTTPDocumentSource(httpClient, uri);
    }
View Full Code Here

        @Override
        protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
                throws IOException, URISyntaxException {
            requestedURI = uri;
            if(content != null) {
                return new StringDocumentSource(content, uri);
            } else {
                return super.createHTTPDocumentSource(httpClient, uri);
            }
        }
View Full Code Here

            if (req.getParameter("type") != null && !"".equals(req.getParameter("type"))) {
                type = req.getParameter("type");
            }
            log("Attempting conversion to '" + format + "' from body parameter");
            responder.runExtraction(
                    new StringDocumentSource(req.getParameter("body"), Servlet.DEFAULT_BASE_URI, type),
                    eps,
                    format,
                    report, annotate
            );
            return;
View Full Code Here

        ps.println("</issueReport>");

    }

    private void printReport(String msg, Throwable e, ExtractionReport er, PrintStream ps) {
        XMLValidationReportSerializer reportSerializer = new XMLValidationReportSerializer();
        ps.println("<report>");

        // Human readable error message.
        if(msg != null) {
            ps.printf("<message>%s</message>\n", msg);
        } else {
            ps.print("<message/>\n");
        }

        // Error stack trace.
        if(e != null) {
            ps.println("<error>");
            ps.println("<![CDATA[");
            e.printStackTrace(ps);
            ps.println("]]>");
            ps.println("</error>");
        } else {
            ps.println("<error/>");
        }

        // Issue Report.
        printIssueReport(er, ps);

        // Validation report.
        try {
            reportSerializer.serialize(er.getValidationReport(), ps);
        } catch (SerializationException se) {
            ps.println("An error occurred while serializing error.");
            se.printStackTrace(ps);
        }
        ps.println("</report>");
View Full Code Here

        fw.setAnnotated(annotate);
        outputMediaType = factory.getMimeType();
        List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>();
        tripleHandlers.add(new IgnoreAccidentalRDFa(fw));
        tripleHandlers.add(new CountingTripleHandler());
        rdfWriter = new CompositeTripleHandler(tripleHandlers);
        reporter = new ReportingTripleHandler(rdfWriter);
        rdfWriter = new IgnoreAccidentalRDFa(
            new IgnoreTitlesOfEmptyDocuments(reporter),
            true    // suppress stylesheet triples.
        );
View Full Code Here

    private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
        baos = new ByteArrayOutputStream();
        rdfxmlWriter = new RDFXMLWriter(baos);
        repositoryWriter = new RepositoryWriter(conn);

        final CompositeTripleHandler cth = new CompositeTripleHandler();
        cth.addChild(rdfxmlWriter);
        cth.addChild(repositoryWriter);

        final ModifiableConfiguration configuration = DefaultConfiguration.copy();
        configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
        SingleDocumentExtraction instance =  new SingleDocumentExtraction(
                configuration,
View Full Code Here

TOP

Related Classes of org.apache.any23.extractor.html.HTMLDocument.TextField

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.