Package com.bericotech.clavin.index

Source Code of com.bericotech.clavin.index.IndexDirectoryBuilder

package com.bericotech.clavin.index;

import static java.util.concurrent.TimeUnit.MILLISECONDS;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.bericotech.clavin.gazetteer.GeoName;

/*#####################################################################
*
* CLAVIN (Cartographic Location And Vicinity INdexer)
* ---------------------------------------------------
*
* Copyright (C) 2012-2013 Berico Technologies
* http://clavin.bericotechnologies.com
*
* ====================================================================
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* ====================================================================
*
* IndexDirectoryBuilder.java
*
*###################################################################*/

/**
* Builds a Lucene index of geographic entries based on
* the GeoNames gazetteer.
*
* This program is run one-time before CLAVIN can be used.
*
*/
public class IndexDirectoryBuilder {
   
    public final static Logger logger = LoggerFactory.getLogger(IndexDirectoryBuilder.class);
   
    // the GeoNames gazetteer file to be loaded
    static String pathToGazetteer = "./allCountries.txt";

    /**
     * Turns a GeoNames gazetteer file into a Lucene index, and adds
     * some supplementary gazetteer records at the end.
     *
     * @param args              not used
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
       
        // exit gracefully if the directory exists
        File idir = new File("./IndexDirectory");
        if (idir.exists() ) {
            logger.info("IndexDirectory exists. Remove the directory and try again.");
            System.exit(-1);
        }
       
        logger.info("Indexing... please wait.");
       
        // Create a new index file on disk, allowing Lucene to choose
        // the best FSDirectory implementation given the environment.
        FSDirectory index = FSDirectory.open(idir);
       
        // indexing by lower-casing & tokenizing on whitespace
        Analyzer indexAnalyzer = new WhitespaceLowerCaseAnalyzer();
       
        // create the object that will actually build the Lucene index
        IndexWriter indexWriter = new IndexWriter(index, new IndexWriterConfig(Version.LUCENE_40, indexAnalyzer));
       
        // open the gazetteer files to be loaded
        BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(new File(pathToGazetteer)), "UTF-8"));
        BufferedReader r2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File("./src/main/resources/SupplementaryGazetteer.txt")), "UTF-8"));
       
        String line;
       
        // let's see how long this takes...
        Date start = new Date();
       
        // load GeoNames gazetteer into Lucene index
        int count = 0;
        while ((line = r.readLine()) != null)
            try {
                count += 1;
                // print progress update to console
                if (count % 100000 == 0 ) logger.info("rowcount: " + count);
                addToIndex(indexWriter, line);
            } catch (Exception e) {
                 logger.info("Skipping... Error on line:" + line);
            }

        // add supplementary gazetteer records to index
        while ((line = r2.readLine()) != null)
            addToIndex(indexWriter, line);
       
        // that wasn't so long, was it?
        Date stop = new Date();
       
        logger.info("[DONE]");
        logger.info(indexWriter.maxDoc() + " geonames added to index.");
        logger.info("Merging indices... please wait.");
       
        indexWriter.close();
        index.close();
        r.close();
        r2.close();
       
        logger.info("[DONE]");
       
        DateFormat df = new SimpleDateFormat("HH:mm:ss");
        long elapsed_MILLIS = stop.getTime() - start.getTime();
        logger.info("Process started: " + df.format(start) + ", ended: " + df.format(stop)
                + "; elapsed time: " + MILLISECONDS.toSeconds(elapsed_MILLIS) + " seconds.");
    }
   
    /**
     * Adds entries to the Lucene index for each unique name associated
     * with a {@link GeoName} object.
     *
     * @param indexWriter   the object that actually builds the Lucene index
     * @param geonameEntry  single record from GeoNames gazetteer
     * @throws IOException
     */
    private static void addToIndex(IndexWriter indexWriter, String geonameEntry) throws IOException {
        // create a GeoName object from a single gazetteer record
        GeoName geoname = GeoName.parseFromGeoNamesRecord(geonameEntry);
       
        // add the primary (UTF-8) name for this location
        if (geoname.name.length() > 0)
            indexWriter.addDocument(buildDoc(geoname.name, geonameEntry, geoname.geonameID, geoname.population));
       
        // add the ASCII name if it's different from the primary name
        if (geoname.asciiName.length() > 0 && !geoname.asciiName.equals(geoname.name))
            indexWriter.addDocument(buildDoc(geoname.asciiName, geonameEntry, geoname.geonameID, geoname.population));
       
        // add alternate names (if any) if they differ from the primary
        // and alternate names
        for (String altName : geoname.alternateNames)
            if (altName.length() > 0 && !altName.equals(geoname.name) && !altName.equals(geoname.name))
                indexWriter.addDocument(buildDoc(altName, geonameEntry, geoname.geonameID, geoname.population));
    }
   
    /**
     * Builds a Lucene document to be added to the index based on a
     * specified name for the location and the corresponding
     * {@link GeoName} object.
     *
     * @param name          name to serve as index key
     * @param geonameEntry  string from GeoNames gazetteer
     * @param geonameID     unique identifier (for quick look-up)
     * @param population    number of inhabitants (used for scoring)
     * @return              document to be added to the index
     */
    public static Document buildDoc(String name, String geonameEntry, int geonameID, Long population) {
        // in case you're wondering, yes, this is a non-standard use of
        // the Lucene Document construct :)
        Document doc = new Document();
       
        // this is essentially the key we'll try to match location
        // names against
        doc.add(new TextField("indexName", name, Field.Store.YES));
       
        // this is the payload we'll return when matching location
        // names to gazetteer records
        doc.add(new StoredField("geoname", geonameEntry));
       
        // TODO: use geonameID to link administrative subdivisions to
        //       each other
        doc.add(new IntField("geonameID", geonameID, Field.Store.YES));
       
        // we'll initially sort match results based on population
        doc.add(new LongField("population", population, Field.Store.YES));
       
        logger.debug("Adding to index: " + name);
       
        return doc;
    }

}
TOP

Related Classes of com.bericotech.clavin.index.IndexDirectoryBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.