Package com.apress.progwt.server.lucene

Source Code of com.apress.progwt.server.lucene.HTMLAnalyzer

/*
* Copyright 2008 Jeff Dwyer
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.apress.progwt.server.lucene;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.compass.core.converter.ConversionException;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.util.ParserException;

/**
* NOTE: Unused. Created during some confusion over the role of compass
* Converters vs Analyzers
*
*
* @author Jeff Dwyer
*
*/
public class HTMLAnalyzer extends StandardAnalyzer {
    private static final Logger log = Logger
            .getLogger(HTMLAnalyzer.class);

    @Override
    public TokenStream tokenStream(String name, Reader reader) {

        if (log.isDebugEnabled()) {
            TokenStream ts = super.tokenStream(name,
                    htmlReaderFromReader(reader));
            Token t;
            if (log.isDebugEnabled()) {
                try {
                    while ((t = ts.next()) != null) {
                        log.debug("token: " + t);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        return super.tokenStream(name, htmlReaderFromReader(reader));
    }

    private Reader htmlReaderFromReader(Reader reader)
            throws ConversionException {

        StringBuilder stringBuilder = new StringBuilder();

        try {
            BufferedReader br = new BufferedReader(reader);

            String s;
            while ((s = br.readLine()) != null) {
                stringBuilder.append(s);
            }

            Lexer l = new Lexer(stringBuilder.toString());
            Parser parser = new Parser(l);
            StringBean sb = new StringBean();

            parser.visitAllNodesWith(sb);

            String ret = sb.getStrings();
            return new StringReader(ret);
        } catch (ParserException e) {
            log.warn("Conversion Exception: " + e);
            throw new ConversionException(e.getMessage());
        } catch (IOException e2) {
            log.warn("Conversion Exception: " + e2);
            throw new ConversionException(e2.getMessage());
        }

    }
}
TOP

Related Classes of com.apress.progwt.server.lucene.HTMLAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.