// Package org.archive.nutch.trec
//
// Source code of org.archive.nutch.trec.TRECFeaturesExtractor

package org.archive.nutch.trec;

import org.apache.lucene.document.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.store.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.search.rankers.IRankingFunction;

import java.io.*;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.searcher.NutchBean;
import org.apache.nutch.searcher.Hits;
import org.apache.hadoop.io.LongWritable;

import org.apache.nutch.searcher.basic.BasicQueryFilter;
import org.apache.nutch.searcher.NutchBean;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.searcher.HitDetails;
import org.apache.nutch.indexer.NutchSimilarity;
import org.apache.nutch.analysis.CommonGrams;


/**
* Extract TREC qrels and Lucene ranking results (from explanation)
* @author Miguel Costa
*/
public class TRECFeaturesExtractor extends NutchBean {
 
    private HashMap<String,String> hmap=null;
 
    /**
    * Constructor
    * @param conf
    * @param path search-servers.txt dir
    * @param boostsFile boosts file
    * @throws IOException
    */
    public TRECFeaturesExtractor(Configuration conf, Path path) throws IOException {
      super(conf,path);
     
      hmap=new HashMap<String,String>();               
    }
       
    /**
     * Read queries from TREC benchmark GOV1
     */
    private void readQueries(String filename) throws Exception {
      BufferedReader in = new BufferedReader(new FileReader(filename));
      String str;
      String qid=null;
      String queryStr=null;
      while ((str = in.readLine()) != null) {
           
        if (str.startsWith("<num> Number:")) {
          qid = str.substring("<num> Number: WT04-:".length()-1);
        }
             else if (str.startsWith("<title>")) {             
               queryStr = str.substring("<title>".length()+1)
               hmap.put(qid, queryStr);                 
              
               qid=null;
               queryStr=null;                  
             }           
      }       
      in.close();
   
   
    /**
     * Get query
     * @param qid query id
     */
    private String getQuery(String qid) {
      return hmap.get(qid);
    }
   
   
    /**
     * Get ranking values from explanation structure
     * @param expl
     * @param tab tabulator
     * @param featuresBoosts boosts ofr each feature
     */
    /*
    private static float[] getValuesFromExplanation(Explanation expl, String tab, float featuresBoosts[]) {
               
        if (expl.getDescription().startsWith("weight(url:")) {
          featuresBoosts[0]+=expl.getValue();
        }
        else if (expl.getDescription().startsWith("weight(anchor:")) {
          featuresBoosts[1]+=expl.getValue();
        }
        else if (expl.getDescription().startsWith("weight(content:")) {
          featuresBoosts[2]+=expl.getValue();
        }
        else if (expl.getDescription().startsWith("weight(title:")) {
          featuresBoosts[3]+=expl.getValue();
        }
        else if (expl.getDescription().startsWith("weight(host:")) {
          featuresBoosts[4]+=expl.getValue();
        }
       
        Explanation explDetails[]=expl.getDetails();
        if (explDetails!=null) {
          for (int i=0;i<explDetails.length;i++) {       
            getValuesFromExplanation(explDetails[i],tab+" ",featuresBoosts);
          }
        }
       
        return featuresBoosts;
    }
    */

 
    /**
     * Main
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        String usage = "org.archive.nutch.trec.TRECFeaturesExtractor <qrels file> <index dir> <queries file> <debug>";
           
        if (args.length < 4) {
          System.err.println(usage);
          System.exit(-1);
        }              
       
        try {
          Configuration conf = NutchConfiguration.create();
          conf.set("arquivo.include.types","html|xhtml+xml|xml|pdf|postscript|text|msword|vnd.ms-powerpoint|rtf|richtext"); // at nutch-site.xml
       
          TRECFeaturesExtractor extractorBean=new TRECFeaturesExtractor(conf,new Path(args[1]));
          Query queryInput = null;         
          BooleanQuery queryOutput = null;
          TermQuery queryDocno = null;
          boolean debug=Boolean.parseBoolean(args[3]);
         
          /*
          BasicQueryFilter basicFilter=new BasicQueryFilter();         
          basicFilter.setConf(conf); // must be before set boosts         
          basicFilter.setUrlBoost(1);
          basicFilter.setAnchorBoost(1);
          basicFilter.setContentBoost(1);
          basicFilter.setTitleBoost(1);
          basicFilter.setHostBoost(1);
          basicFilter.setPhraseBoost(1);
          basicFilter.setSlop(3);  // constant
          */
                   
          extractorBean.readQueries(args[2]);
                           
          Directory idx = FSDirectory.getDirectory(args[1], false);
          org.apache.lucene.search.Searcher searcher = new IndexSearcher(idx);
          org.apache.lucene.index.IndexReader reader=IndexReader.open(idx);
          searcher.setSimilarity(new NutchSimilarity()); // line ignored; ranking is hard coded in lucene                 
         
            BufferedReader in = new BufferedReader(new FileReader(args[0]));
            String str;
            String splitStr[]=null;
            String qid=null;      
            String docno=null;
            String srel=null;
           
            ArquivoWebFunctionsWritable functions=new ArquivoWebFunctionsWritable();           
        for (int i=0;i<IRankingFunction.functions.length;i++) { // all functions with boost 1
          functions.addFunction(i,1);
        }
                   
            while ((str = in.readLine()) != null) {                           
              splitStr=str.split(" ");
              qid = splitStr[0];
              docno = splitStr[2];
              srel = splitStr[3];                                            
             
              // build query
              queryInput = Query.parse(extractorBean.getQuery(qid), conf);
                                                     
              queryOutput=new BooleanQuery();                                                   
              //basicFilter.filter(queryInput, queryOutput);
              extractorBean.buildQuery(queryInput,queryOutput,conf);
              queryDocno=new TermQuery(new Term("DOCNO",docno));
              queryOutput.add(queryDocno, BooleanClause.Occur.MUST);
             
              if (debug) {
            System.out.println("Query:"+queryOutput);
          }
             
              /*
              org.apache.lucene.search.Hits hits = searcher.search(queryOutput);  // limited to 1000                                                
              int length = hits.length();
              */
             
              int length=0;
              int firstDoc=0;
          ArquivoWebScorer scorer=new ArquivoWebScorer(queryOutput, searcher, reader, functions);
            while (scorer.next()) {
              if (length==0) {
                firstDoc=scorer.doc();
              }
            length++;
          }
           
              /*
              Hits hits = extractorBean.search(queryInput, 10, -1, 0, "site", null, false, boostParams);         
              int length = (int)Math.min(hits.getLength(), 10);                
          */
         
          if (length>1) { // sanity check
            throw new Exception("Unexpected length:"+length);
          }       
          if (length<=0) {           
            System.out.println("rel:"+srel+" qid:"+qid+" docno:"+docno+" NOT FOUND")
          }
          else if (length==1) {           
            //System.out.println(qid+" "+queryInput+" "+docno);
            //float featuresBoosts[]=new float[6];
            //featuresBoosts=getValuesFromExplanation(searcher.explain(queryOutput,hits.id(0)),"",featuresBoosts);                     
         
            System.out.print("rel:"+srel+" qid:"+qid+" docno:"+docno+" ");
            //Explanation explDetails[]=searcher.explain(queryOutput,hits.id(0)).getDetails();
            Explanation expl[]=scorer.explain(firstDoc).getDetails()
            Explanation expl2[];
            if (expl!=null) {
              for (int i=0;i<expl.length;i++) {       
                System.out.print(expl[i].getDescription()+":"+expl[i].getValue()+" ");
                expl2=expl[i].getDetails();
                if (expl2!=null) {
                  for (int j=0;j<expl2.length;j++) {
                    System.out.print(expl2[j].getDescription()+":"+expl2[j].getValue()+" ");
                  }               
                }
              }
            } 
            System.out.println();
           
            /*
            for (int i=0;i<featuresBoosts.length;i++) {
              System.out.print((i+1)+":"+featuresBoosts[i]+" ");
            }
            */
            //System.out.print("expl:"+searcher.explain(queryOutput,hits.id(0))+" ");         
            //System.out.println("#docid = "+docno);
            /*
            HitDetails details=extractorBean.getDetails(hits.getHit(0));
            System.out.println(details.toHtml());
            */       
          }
          //System.out.println("length:"+length);
                   
            }
            in.close();                   
        }
        catch (IOException e) {
          System.err.println("Problem reading file: "+e.getMessage());
          e.printStackTrace();
          System.err.println(usage);
        }
        catch (Exception e) {
          System.err.println("Problem: "+e.getMessage());
          e.printStackTrace();
          System.err.println(usage);
        }
    }
   
   
   
    private static final String[] FIELDS = { "url", "anchor", "content", "title", "host" };     
    //private float[] FIELD_BOOSTS = new float[5];
   
    /**
     * Build boolean query
     * @param input
     * @param output
     */
    private void buildQuery(Query input, BooleanQuery output, Configuration conf) {
        Clause[] clauses = input.getClauses();
        for (int i = 0; i < clauses.length; i++) {
          Clause c = clauses[i];

          if (!c.getField().equals(Clause.DEFAULT_FIELD))
            continue;                                 // skip non-default fields
                 
          String[] sterms = null;
          if (c.isPhrase()) {
            //sterms = new CommonGrams(conf).optimizePhrase(c.getPhrase(), FIELDS[f]);         
            sterms = new String[c.getPhrase().getTerms().length];
            for (int j=0; j<sterms.length; j++) {
              sterms[j]=c.getPhrase().getTerms()[j].toString();
            }
          }
          else {
            sterms = new String[1];
            sterms[0]=c.getTerm().toString();
          }

          for (int j=0; j<sterms.length; j++) {   
            BooleanQuery out = new BooleanQuery();
            for (int f=0; f<FIELDS.length; f++) {                                     
              out.add(new TermQuery(new Term(FIELDS[f], sterms[j])), BooleanClause.Occur.SHOULD);
            }
            output.add(out, (c.isProhibited()
                      ? BooleanClause.Occur.MUST_NOT
                      : (c.isRequired()
                          ? BooleanClause.Occur.MUST
                          : BooleanClause.Occur.SHOULD
                        )));
          }         
        }       
    }
  
}
// TOP
//
// Related Classes of org.archive.nutch.trec.TRECFeaturesExtractor
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.