Package no.priv.garshol.duke

Source Code of no.priv.garshol.duke.ConfigurationImpl

package no.priv.garshol.duke;

import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;

import no.priv.garshol.duke.utils.Utils;
import no.priv.garshol.duke.utils.ObjectUtils;

/**
* Holds the configuration details for a dataset.
*/
public class ConfigurationImpl implements Configuration {

  // there are two modes: deduplication and record linkage. in
  // deduplication mode all sources are in 'datasources'. in record
  // linkage mode they are in 'group1' and 'group2'. couldn't think
  // of a better solution. sorry.
  private Collection<DataSource> datasources;
  private Collection<DataSource> group1;
  private Collection<DataSource> group2;

  private double threshold;
  private double thresholdMaybe;

  private Map<String, Property> properties;
  private List<Property> proplist; // duplicate to preserve order
  private Collection<Property> lookups; // subset of properties

  private Database database1;
  private Database database2; // used for record linkage, if necessary

  private List<Comparator> customComparators;

  public ConfigurationImpl() {
    this.datasources = new ArrayList();
    this.group1 = new ArrayList();
    this.group2 = new ArrayList();
    this.customComparators = new ArrayList<Comparator>();
  }

  /**
   * Returns the data sources to use (in deduplication mode; don't use
   * this method in record linkage mode).
   */
  public Collection<DataSource> getDataSources() {
    return datasources;
  }

  /**
   * Returns the data sources belonging to a particular group of data
   * sources. Data sources are grouped in record linkage mode, but not
   * in deduplication mode, so only use this method in record linkage
   * mode.
   */
  public Collection<DataSource> getDataSources(int groupno) {
    if (groupno == 1)
      return group1;
    else if (groupno == 2)
      return group2;
    else
      throw new DukeConfigException("Invalid group number: " + groupno);
  }

  /**
   * Adds a data source to the configuration. If in deduplication mode
   * groupno == 0, otherwise it gives the number of the group to which
   * the data source belongs.
   */
  public void addDataSource(int groupno, DataSource datasource) {
    // the loader takes care of validation
    if (groupno == 0)
      datasources.add(datasource);
    else if (groupno == 1)
      group1.add(datasource);
    else if (groupno == 2)
      group2.add(datasource);
  }

  public Database getDatabase(boolean overwrite) {
    return getDatabase(1, overwrite);
  }

  public Database getDatabase(int groupno, boolean overwrite) {
    Database thedb;
    if (groupno == 1) {
      if (database1 == null) // not set, so use default
        database1 = new no.priv.garshol.duke.databases.LuceneDatabase();
      thedb = database1;
    } else if (groupno == 2)
      thedb = database2; // no default for no 2
    else
      throw new DukeException("Can only have two databases");

    if (thedb != null) {
      thedb.setConfiguration(this);
      thedb.setOverwrite(overwrite); // hmmm?
    }
    return thedb;
  }

  public void addDatabase(Database database) {
    if (database1 == null)
      database1 = database;
    else if (database2 == null)
      database2 = database;
    else
      throw new DukeConfigException("Too many database objects configured");
  }

  /**
   * The probability threshold used to decide whether two records
   * represent the same entity. If the probability is higher than this
   * value, the two records are considered to represent the same
   * entity.
   */
  public double getThreshold() {
    return threshold;
  }

  /**
   * Sets the probability threshold for considering two records
   * equivalent.
   */
  public void setThreshold(double threshold) {
    this.threshold = threshold;
  }

  /**
   * The probability threshold used to decide whether two records may
   * represent the same entity. If the probability is higher than this
   * value, the two records are considered possible matches. Can be 0,
   * in which case no records are considered possible matches.
   */
  public double getMaybeThreshold() {
    return thresholdMaybe;
  }

  /**
   * Returns true iff we are in deduplication mode.
   */
  public boolean isDeduplicationMode() {
    return !getDataSources().isEmpty();
  }

  /**
   * Sets the probability threshold for considering two records
   * possibly equivalent. Does not have to be set.
   */
  public void setMaybeThreshold(double thresholdMaybe) {
    this.thresholdMaybe = thresholdMaybe;
  }

  /**
   * The set of properties Duke is to work with.
   */
  public void setProperties(List<Property> props) {
    this.proplist = props;
    this.properties = new HashMap(props.size());
    for (Property prop : props)
      properties.put(prop.getName(), prop);

    // analyze properties to find lookup set
    findLookupProperties();
  }

  /**
   * The set of properties Duke records can have, and their associated
   * cleaners, comparators, and probabilities.
   */
  public List<Property> getProperties() {
    return proplist;
  }

  /**
   * The properties which are used to identify records, rather than
   * compare them.
   */
  public Collection<Property> getIdentityProperties() {
    Collection<Property> ids = new ArrayList();
    for (Property p : getProperties())
      if (p.isIdProperty())
        ids.add(p);
    return ids;
  }

  /**
   * Returns the property with the given name, or null if there is no
   * such property.
   */
  public Property getPropertyByName(String name) {
    return properties.get(name);
  }

  /**
   * Returns the properties Duke queries for in the Lucene index. This
   * is a subset of getProperties(), and is computed based on the
   * probabilities and the threshold.
   */
  public Collection<Property> getLookupProperties() {
    return lookups;
  }

  /**
   * Validates the configuration to verify that it makes sense.
   * Rejects configurations that will fail during runtime.
   */
  public void validate() {
    // verify that we do have properties
    if (properties == null || properties.isEmpty())
      throw new DukeConfigException("Configuration has no properties at all");

    // check if max prob is below threshold
    // this code duplicates code in findLookupProperties(), but prefer
    // that to creating an attribute
    double prob = 0.5;
    for (Property prop : properties.values()) {
      if (prop.getHighProbability() == 0.0)
        // if the probability is zero we ignore the property entirely
        continue;

      prob = Utils.computeBayes(prob, prop.getHighProbability());
    }
    if (prob < threshold)
      throw new DukeConfigException("Maximum possible probability is " + prob +
                                 ", which is below threshold (" + threshold +
                                 "), which means no duplicates will ever " +
                                 "be found");

    // check that we have at least one ID property
    if (getIdentityProperties().isEmpty())
      throw new DukeConfigException("No ID properties.");
  }

  private void findLookupProperties() {
    List<Property> candidates = new ArrayList();
    for (Property prop : properties.values())
      // leave out properties that are either not used for comparisons,
      // or which have lookup turned off explicitly
      if (!prop.isIdProperty() &&
          !prop.isIgnoreProperty() &&
          prop.getLookupBehaviour() != Property.Lookup.FALSE &&
          prop.getHighProbability() != 0.0)
        candidates.add(prop);


    // sort them, lowest high prob to highest high prob
    Collections.sort(candidates, new HighComparator());

    // run over and find all those needed to get above the threshold
    int last = -1;
    double prob = 0.5;
    for (int ix = 0; ix < candidates.size(); ix++) {
      Property prop = candidates.get(ix);
      prob = Utils.computeBayes(prob, prop.getHighProbability());
      if (prob >= threshold) {
        last = ix;
        break;
      }
    }

    if (last == -1)
      lookups = new ArrayList();
    else
      lookups = new ArrayList(candidates.subList(0, last + 1));


    // need to also add TRUE and REQUIRED
    for (Property p : proplist) {
      if (p.getLookupBehaviour() != Property.Lookup.TRUE &&
          p.getLookupBehaviour() != Property.Lookup.REQUIRED)
        continue;

      if (lookups.contains(p))
        continue;

      lookups.add(p);
    }
  }

  private static class HighComparator implements java.util.Comparator<Property> {
    public int compare(Property p1, Property p2) {
      if (p1.getHighProbability() < p2.getHighProbability())
        return 1;
      else if (p1.getHighProbability() == p2.getHighProbability())
        return 0;
      else
        return -1;
    }
  }

  public Configuration copy() {
    ConfigurationImpl copy = new ConfigurationImpl();
    for (DataSource src : datasources)
      copy.addDataSource(0, src);
    for (DataSource src : group1)
      copy.addDataSource(1, src);
    for (DataSource src : group2)
      copy.addDataSource(2, src);

    copy.setThreshold(threshold);
    copy.setMaybeThreshold(thresholdMaybe);
    copy.addDatabase(database1);
    if (database2 != null)
      copy.addDatabase(database2);

    List<Property> newprops = new ArrayList();
    for (Property p : proplist)
      newprops.add(p.copy());
    copy.setProperties(newprops);

    return copy;
  }


  @Override
  public List<Comparator> getCustomComparators() {
  return this.customComparators;
  }

  @Override
  public void addCustomComparator(Comparator comparator) {
  this.customComparators.add(comparator);
  }
}
TOP

Related Classes of no.priv.garshol.duke.ConfigurationImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.