Package edu.ucla.sspace.text

Examples of edu.ucla.sspace.text.StringDocument


                // process it as a single document.  For consistency, strip off
                // the USENET threading formatting, e.g. >>>, from the front of
                // each line.
                while ((line = usenetReader.readLine()) != null) {
                    if (line.contains(END_OF_DOCUMENT))
                        return new StringDocument(cleanDoc(content.toString()));
                    else {
                        int lineStart = 0;
                        // Find the first non '>' or ' ' in the line to
                        // determine where the auto-threading formatting stops.
                        for (char c = line.charAt(lineStart);
View Full Code Here


                        context.append("|||| ");
                        tokens[i] = lemma;
                    }
                    context.append(tokens[i]).append(" ");
                }
                contexts.add(new StringDocument(context.toString()));
                context.setLength(0);
            }
        }
View Full Code Here

                inHead = false;
            else if (name.equals("lexelt"))
                inLexElement = false;
            else if (name.equals("context")) {
                inContext = false;
                contexts.add(new StringDocument(context.toString()));
                context.setLength(0);
            }
        }
View Full Code Here

                        // that substring.
                        if (endIndex > startIndex) {
                            String extractedContent =
                                line.substring(startIndex, endIndex);
                            extractedContent = cleanDoc(extractedContent);
                            return new StringDocument(extractedContent);
                        }
                        // Otherwise create a new builder and everything
                        // appearing after the content tag.
                        else  {
                            content = new StringBuilder(line.substring(
                                        startIndex));
                            inContent = true;
                        }
                    } else if (line.contains("</content>")) {
                        inContent = false;
                        // If this is the end of the content, extract everything
                        // before it and return the total amount of text
                        // extracted.
                        int endIndex = line.lastIndexOf("<");
                        content.append(line.substring(0, endIndex));

                        return new StringDocument(cleanDoc(content.toString()));
                    } else if (line.contains("<updated>") && content != null) {
                        // When the line has an updated tag and content is not
                        // null, we need to extract the date time and prepend it
                        // to the content.
                        int startIndex = line.indexOf(">")+1;
                        int endIndex = line.lastIndexOf("<");
                        String date = line.substring(startIndex, endIndex);
                        long dateTime = date.equals("")
                            ? 0 :
                            Timestamp.valueOf(date).getTime();
                        String doc = String.format(
                                "%d %s", dateTime,
                                cleanDoc(content.toString()));
                        return new StringDocument(doc);
                    } else if (inContent && content != null) {
                        // If the content builder has been created, we know this
                        // line contains content.  Add it to the builder.
                        content.append(line);
                    }
View Full Code Here

                    addTextFromUtterance((Element) utterances.item(i),
                                         utteranceBuilder);
                    utteranceBuilder.append(". ");
                }
            }
            return new StringDocument(utteranceBuilder.toString());
        }
View Full Code Here

        /**
         * {@inheritDoc}
         */
        public synchronized Document next() {
            Document doc = new StringDocument(currentDoc);
            currentDoc = advance();
            return doc;
        }
View Full Code Here

        /**
         * {@inheritDoc}
         */
        public Document next() {
            Document doc = new StringDocument(next);
            next = advance();
            return doc;
        }
View Full Code Here

        /**
         * {@inheritDoc}
         */
        public Document next() {
            Document doc = new StringDocument(next);
            next = advance();
            return doc;
        }
View Full Code Here

                              Arrays.asList(expectedRelations)));
    }

    @Test public void testSingleExtraction() throws Exception {
        DependencyExtractor extractor = new CoNLLDependencyExtractor();
        Document doc = new StringDocument(toTabs(SINGLE_PARSE));
        DependencyTreeNode[] nodes = extractor.readNextTree(doc.reader());

        assertEquals(12, nodes.length);

        // Check the basics of the node.
        assertEquals("review", nodes[8].word());
View Full Code Here

                    Arrays.asList(expectedRelations)));
    }

    @Test public void testDoubleExtraction() throws Exception {
        DependencyExtractor extractor = new CoNLLDependencyExtractor();
        Document doc = new StringDocument("\n\n" +
                                          toTabs(SINGLE_PARSE) +
                                          "\n" +
                                          toTabs(SECOND_PARSE));
        BufferedReader reader = doc.reader();
        DependencyTreeNode[] relations = extractor.readNextTree(reader);
        assertTrue(relations != null);
        assertEquals(12, relations.length);

        testFirstRoot(relations, 2);
View Full Code Here

TOP

Related Classes of edu.ucla.sspace.text.StringDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.