/*
* LogAnalyser.java
*
* Version: $Revision: 4735 $
*
* Date: $Date: 2010-02-01 23:11:43 +0000 (Mon, 01 Feb 2010) $
*
* Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the DSpace Foundation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
package org.dspace.app.statistics;
import org.dspace.app.statistics.LogLine;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Context;
import org.dspace.core.LogManager;
import org.dspace.storage.rdbms.DatabaseManager;
import org.dspace.storage.rdbms.TableRow;
import java.sql.SQLException;
import java.lang.Long;
import java.lang.StringBuffer;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.StringTokenizer;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
/**
* This class performs all the actual analysis of a given set of DSpace log
* files. Most input can be configured; use the -help flag for a full list
* of usage information.
*
* The output of this file is plain text and forms an "aggregation" file which
* can then be used for display purposes using the related ReportGenerator
* class.
*
* @author Richard Jones
*/
public class LogAnalyser
{
// set up our class globals
// FIXME: there are so many of these perhaps they should exist in a static
// object of their own
/////////////////
// aggregators
/////////////////
/** aggregator for all actions performed in the system */
private static Map actionAggregator;
/** aggregator for all searches performed */
private static Map searchAggregator;
/** aggregator for user logins */
private static Map userAggregator;
/** aggregator for item views */
private static Map itemAggregator;
/** aggregator for current archive state statistics */
private static Map archiveStats;
/** warning counter */
private static int warnCount = 0;
/** log line counter */
private static int lineCount = 0;
//////////////////
// config data
//////////////////
/** list of actions to be included in the general summary */
private static List generalSummary;
/** list of words not to be aggregated */
private static List excludeWords;
/** list of search types to be ignored, such as "author:" */
private static List excludeTypes;
/** list of characters to be excluded */
private static List excludeChars;
/** list of item types to be reported on in the current state */
private static List itemTypes;
/** bottom limit to output for search word analysis */
private static int searchFloor;
/** bottom limit to output for item view analysis */
private static int itemFloor;
/** number of items from most popular to be looked up in the database */
private static int itemLookup;
/** mode to use for user email display */
private static String userEmail;
/** URL of the service being analysed */
private static String url;
/** Name of the service being analysed */
private static String name;
/** Host name of the service being analysed (read from the dspace.hostname property) */
private static String hostName;
/** the average number of views per item */
private static int views = 0;
///////////////////////
// regular expressions
///////////////////////
/** Exclude characters regular expression pattern */
private static Pattern excludeCharRX = null;
/** handle indicator string regular expression pattern */
private static Pattern handleRX = null;
/** item id indicator string regular expression pattern */
private static Pattern itemRX = null;
/** query string indicator regular expression pattern */
private static Pattern queryRX = null;
/** collection indicator regular expression pattern */
private static Pattern collectionRX = null;
/** community indicator regular expression pattern */
private static Pattern communityRX = null;
/** results indicator regular expression pattern */
private static Pattern resultsRX = null;
/** single character regular expression pattern */
private static Pattern singleRX = null;
/** a pattern to match a valid version 1.3 log file line */
private static Pattern valid13 = null;
/** a pattern to match a valid version 1.4 log file line */
private static Pattern valid14 = null;
/** pattern to match valid log file names */
private static Pattern logRegex = null;
/** pattern to match commented out lines from the config file */
private static Pattern comment = Pattern.compile("^#");
/** pattern to match genuine lines from the config file */
private static Pattern real = Pattern.compile("^(.+)=(.+)");
/** pattern to match all search types */
private static Pattern typeRX = null;
/** pattern to match all of the configured words to be excluded from search analysis */
private static Pattern wordRX = null;
//////////////////////////
// Miscellaneous variables
//////////////////////////
/** process timing clock */
private static Calendar startTime = null;
/////////////////////////
// command line options
////////////////////////
/** the log directory to be analysed */
private static String logDir = ConfigurationManager.getProperty("log.dir");
/** the regex to describe the file name format */
private static String fileTemplate = "dspace\\.log.*";
/** the config file from which to configure the analyser */
public static String configFile = ConfigurationManager.getProperty("dspace.dir") +
File.separator + "config" + File.separator +
"dstat.cfg";
/** the output file to which to write aggregation data */
private static String outFile = ConfigurationManager.getProperty("log.dir") + File.separator + "dstat.dat";
/** the starting date of the report */
private static Date startDate = null;
/** the end date of the report */
private static Date endDate = null;
/** the starting date of the report as obtained from the log files */
private static Date logStartDate = null;
/** the end date of the report as obtained from the log files */
private static Date logEndDate = null;
/** are we looking stuff up in the database */
private static boolean lookUp = false;
/**
 * main method to be run from command line. See usage information for
 * details as to how to use the command line flags (-help)
 *
 * Options which take a value (-log, -file, -cfg, -out, -start, -end) must
 * be followed by that value; if the value is missing the usage information
 * is printed and the process exits rather than failing with an
 * ArrayIndexOutOfBoundsException.
 *
 * @param argv the command line arguments
 */
public static void main(String [] argv)
    throws Exception, SQLException
{
    // first, start the processing clock
    startTime = new GregorianCalendar();

    // create context as super user
    Context context = new Context();
    context.setIgnoreAuthorization(true);

    // set up our command line variables
    String myLogDir = null;
    String myFileTemplate = null;
    String myConfigFile = null;
    String myOutFile = null;
    Date myStartDate = null;
    Date myEndDate = null;
    boolean myLookUp = false;

    // read in our command line options
    for (int i = 0; i < argv.length; i++)
    {
        String flag = argv[i];

        if (flag.equals("-help"))
        {
            LogAnalyser.usage();
            System.exit(0);
        }
        else if (flag.equals("-lookup"))
        {
            myLookUp = true;
        }
        else if (flag.equals("-log") || flag.equals("-file")
                || flag.equals("-cfg") || flag.equals("-out")
                || flag.equals("-start") || flag.equals("-end"))
        {
            // every one of these options needs a value after it
            if (i + 1 >= argv.length)
            {
                System.out.println("Missing value for option " + flag);
                LogAnalyser.usage();
                System.exit(1);
            }

            // NOTE: as before, the value is not skipped by the loop, so it
            // is itself examined as a potential flag on the next pass
            String value = argv[i + 1];

            if (flag.equals("-log"))
            {
                myLogDir = value;
            }
            else if (flag.equals("-file"))
            {
                myFileTemplate = value;
            }
            else if (flag.equals("-cfg"))
            {
                myConfigFile = value;
            }
            else if (flag.equals("-out"))
            {
                myOutFile = value;
            }
            else if (flag.equals("-start"))
            {
                myStartDate = parseDate(value);
            }
            else
            {
                myEndDate = parseDate(value);
            }
        }
    }

    // now call the method which actually processes the logs
    processLogs(context, myLogDir, myFileTemplate, myConfigFile, myOutFile, myStartDate, myEndDate, myLookUp);
}
/**
 * using the pre-configuration information passed here, analyse the logs
 * and produce the aggregation file
 *
 * The method resets the per-run counters, reads the DStat configuration,
 * builds the regular expressions, walks every file in the log directory
 * whose name matches the file template, aggregates the recognised log
 * lines, queries the database for the archive statistics and finally
 * writes the aggregation file via createOutput().
 *
 * @param context the DSpace context object this occurs under
 * @param myLogDir the passed log directory. Uses default if null
 * @param myFileTemplate the passed file name regex. Uses default if null
 * @param myConfigFile the DStat config file. Uses default if null
 * @param myOutFile the file to which to output aggregation data. Uses default if null
 * @param myStartDate the desired start of the analysis. Starts from the beginning otherwise
 * @param myEndDate the desired end of the analysis. Goes to the end otherwise
 * @param myLookUp force a lookup of the database
 *
 * @throws IOException if a config or log file cannot be read once opened
 * @throws SQLException if the archive statistics queries fail
 */
public static void processLogs(Context context, String myLogDir,
                               String myFileTemplate, String myConfigFile,
                               String myOutFile, Date myStartDate,
                               Date myEndDate, boolean myLookUp)
    throws IOException, SQLException
{
    // FIXME: perhaps we should have all parameters and aggregators put
    // together in a single aggregating object

    // (re)start the processing clock for this run
    startTime = new GregorianCalendar();

    // reset the per-run counters and observed date range; the aggregators
    // are rebuilt below, so stale totals from an earlier call in the same
    // JVM must not leak into this report
    lineCount = 0;
    warnCount = 0;
    views = 0;
    logStartDate = null;
    logEndDate = null;

    // instantiate aggregators
    actionAggregator = new HashMap();
    searchAggregator = new HashMap();
    userAggregator = new HashMap();
    itemAggregator = new HashMap();
    archiveStats = new HashMap();

    // instantiate lists
    generalSummary = new ArrayList();
    excludeWords = new ArrayList();
    excludeTypes = new ArrayList();
    excludeChars = new ArrayList();
    itemTypes = new ArrayList();

    // set the parameters for this analysis
    setParameters(myLogDir, myFileTemplate, myConfigFile, myOutFile, myStartDate, myEndDate, myLookUp);

    // read in the config information
    readConfig(configFile);

    // assemble the regular expressions for later use (requires the file
    // template to build the regex to match it)
    setRegex(fileTemplate);

    // get the log files; tolerate a null listing
    File[] logFiles = getLogFiles(logDir);
    if (logFiles == null)
    {
        logFiles = new File[0];
    }

    // for every log file do analysis
    // FIXME: it is easy to implement not processing log files after the
    // dates exceed the end boundary, but is there an easy way to do it
    // for the start of the file?  Note that we can assume that the contents
    // of the log file are sequential, but can we assume the files are
    // provided in a date sequence?
    for (int i = 0; i < logFiles.length; i++)
    {
        // check to see if this file is a log file against the global regex
        Matcher matchRegex = logRegex.matcher(logFiles[i].getName());
        if (!matchRegex.matches())
        {
            continue;
        }

        // it is a log file, so open it up and have a look at the contents
        FileReader fr = null;
        BufferedReader br = null;
        try
        {
            fr = new FileReader(logFiles[i].toString());
            br = new BufferedReader(fr);
        }
        catch (IOException e)
        {
            System.out.println("Failed to read log file " + logFiles[i].toString());
            System.exit(0);
        }

        try
        {
            // for each line in the file do the analysis
            // FIXME: perhaps each section needs to be dolled out to an
            // analysing class to allow pluggability of other methods of
            // analysis, and ease of code reading too - Pending further thought
            String line = null;
            while ((line = br.readLine()) != null)
            {
                // get the log line object; null means the line did not
                // match either of the known log formats
                LogLine logLine = getLogLine(line);
                if (logLine == null)
                {
                    continue;
                }

                // first find out if we are constraining by date and
                // if so apply the restrictions
                if ((startDate != null) && (!logLine.afterDate(startDate)))
                {
                    continue;
                }

                // file contents are assumed sequential, so once we pass the
                // end date the rest of this file can be skipped
                if ((endDate != null) && (!logLine.beforeDate(endDate)))
                {
                    break;
                }

                // count the number of lines parsed
                lineCount++;

                // if we are not constrained by date, register the date
                // as the start/end date if it is the earliest/latest so far
                // FIXME: this should probably have a method of its own
                if (startDate == null)
                {
                    if ((logStartDate == null) || logLine.beforeDate(logStartDate))
                    {
                        logStartDate = logLine.getDate();
                    }
                }

                if (endDate == null)
                {
                    if ((logEndDate == null) || logLine.afterDate(logEndDate))
                    {
                        logEndDate = logLine.getDate();
                    }
                }

                // count the warnings
                if (logLine.isLevel("WARN"))
                {
                    // FIXME: really, this ought to be some kind of level
                    // aggregator
                    warnCount++;
                }

                // is the action a search?
                if (logLine.isAction("search"))
                {
                    // get back all the valid search words from the query
                    String[] words = analyseQuery(logLine.getParams());

                    // for each search word add to the aggregator or
                    // increment the aggregator's counter
                    for (int j = 0; j < words.length; j++)
                    {
                        // FIXME: perhaps aggregators ought to be objects
                        // themselves
                        searchAggregator.put(words[j], increment(searchAggregator, words[j]));
                    }
                }

                // is the action a login, and are we counting user logins?
                // (an unconfigured user.email setting is treated as "off")
                if (logLine.isAction("login") && (userEmail != null) && !userEmail.equals("off"))
                {
                    userAggregator.put(logLine.getUser(), increment(userAggregator, logLine.getUser()));
                }

                // is the action an item view?
                if (logLine.isAction("view_item"))
                {
                    String handle = logLine.getParams();

                    // strip the handle string
                    handle = handleRX.matcher(handle).replaceAll("");

                    // strip the item id string
                    handle = itemRX.matcher(handle).replaceAll("");

                    // Strings are immutable: the trimmed value must be kept,
                    // otherwise padded handles become distinct keys
                    handle = handle.trim();

                    // either add the handle to the aggregator or
                    // increment its counter
                    itemAggregator.put(handle, increment(itemAggregator, handle));
                }

                // log all the activity
                actionAggregator.put(logLine.getAction(), increment(actionAggregator, logLine.getAction()));
            }
        }
        finally
        {
            // close the file reading buffers even if analysis failed
            br.close();
            fr.close();
        }
    }

    // do the database lookup for the archive statistics
    // FIXME: this is a kind of separate section.  Would it be worth building
    // the summary string separately and then inserting it into the real
    // summary later?  Especially if we make the archive analysis more complex
    archiveStats.put("All Items", getNumItems(context));
    for (int i = 0; i < itemTypes.size(); i++)
    {
        archiveStats.put(itemTypes.get(i), getNumItems(context, (String) itemTypes.get(i)));
    }

    // now do the host name and url lookup; any of these properties may be
    // missing, so only trim values which are actually present
    String rawHostName = ConfigurationManager.getProperty("dspace.hostname");
    String rawName = ConfigurationManager.getProperty("dspace.name");
    String rawUrl = ConfigurationManager.getProperty("dspace.url");
    hostName = (rawHostName == null) ? null : rawHostName.trim();
    name = (rawName == null) ? null : rawName.trim();
    url = (rawUrl == null) ? null : rawUrl.trim();
    if ((url != null) && (!url.endsWith("/")))
    {
        url = url + "/";
    }

    // do the average views analysis
    // FIXME: this is dependent on their being a query on the db, which
    // there might not always be if it becomes configurable
    int totalItems = ((Integer) archiveStats.get("All Items")).intValue();
    Integer itemViews = (Integer) actionAggregator.get("view_item");
    if ((totalItems != 0) && (itemViews != null))
    {
        // divide as doubles so that Math.ceil really rounds up; integer
        // division would already have truncated the result
        views = (int) Math.ceil(itemViews.doubleValue() / totalItems);
    }

    // finally, write the output
    createOutput();
}
/**
 * set the passed parameters up as global class variables.  This has to
 * be done in a separate method because the API permits for running from
 * the command line with args or calling the processLogs method statically
 * from elsewhere
 *
 * A null value for any of the object parameters leaves the corresponding
 * class variable at its current value.  The look up flag is always
 * applied.
 *
 * @param myLogDir the log file directory to be analysed
 * @param myFileTemplate regex for log file names
 * @param myConfigFile config file to use for dstat
 * @param myOutFile file to write the aggregation into
 * @param myStartDate requested log reporting start date
 * @param myEndDate requested log reporting end date
 * @param myLookUp requested look up force flag
 */
public static void setParameters(String myLogDir, String myFileTemplate,
                                 String myConfigFile, String myOutFile,
                                 Date myStartDate, Date myEndDate,
                                 boolean myLookUp)
{
    if (myLogDir != null)
    {
        logDir = myLogDir;
    }

    if (myFileTemplate != null)
    {
        fileTemplate = myFileTemplate;
    }

    if (myConfigFile != null)
    {
        configFile = myConfigFile;
    }

    if (myStartDate != null)
    {
        startDate = myStartDate;
    }

    if (myEndDate != null)
    {
        endDate = myEndDate;
    }

    // the flag is a primitive and is unrelated to the log directory, so it
    // must not be guarded by the myLogDir null check
    lookUp = myLookUp;

    if (myOutFile != null)
    {
        outFile = myOutFile;
    }
}
/**
 * generate the analyser's output to the specified out file
 *
 * The aggregation is written as plain "key=value" lines built from the
 * class level counters, configuration values and aggregators.  If the
 * output file cannot be written a message is printed and the process
 * exits.
 */
public static void createOutput()
{
    // start a string buffer to hold the final output
    StringBuffer summary = new StringBuffer();

    // define an iterator that will be used to go over the hashmap keys
    Iterator keys = null;

    // output the number of lines parsed
    summary.append("log_lines=" + Integer.toString(lineCount) + "\n");

    // output the number of warnings encountered
    summary.append("warnings=" + Integer.toString(warnCount) + "\n");

    // set the general summary config up in the aggregator file
    for (int i = 0; i < generalSummary.size(); i++)
    {
        summary.append("general_summary=" + generalSummary.get(i) + "\n");
    }

    // output the host name
    summary.append("server_name=" + hostName + "\n");

    // output the service name
    summary.append("service_name=" + name + "\n");

    // output the date information if necessary; explicitly requested
    // dates take precedence over those observed in the logs
    SimpleDateFormat sdf = new SimpleDateFormat("dd'/'MM'/'yyyy");

    if (startDate != null)
    {
        summary.append("start_date=" + sdf.format(startDate) + "\n");
    }
    else if (logStartDate != null)
    {
        summary.append("start_date=" + sdf.format(logStartDate) + "\n");
    }

    if (endDate != null)
    {
        summary.append("end_date=" + sdf.format(endDate) + "\n");
    }
    else if (logEndDate != null)
    {
        summary.append("end_date=" + sdf.format(logEndDate) + "\n");
    }

    // write out the archive stats
    keys = archiveStats.keySet().iterator();
    while (keys.hasNext())
    {
        String key = (String) keys.next();
        summary.append("archive." + key + "=" + archiveStats.get(key) + "\n");
    }

    // write out the action aggregation results
    keys = actionAggregator.keySet().iterator();
    while (keys.hasNext())
    {
        String key = (String) keys.next();
        summary.append("action." + key + "=" + actionAggregator.get(key) + "\n");
    }

    // depending on the config settings for reporting on emails output the
    // login information
    summary.append("user_email=" + userEmail + "\n");

    // null-safe comparisons: user.email may be absent from the config
    boolean showAddresses = "on".equals(userEmail);
    boolean aliasAddresses = "alias".equals(userEmail);
    int address = 1;
    keys = userAggregator.keySet().iterator();

    // for each email address either write out the address and the count
    // or alias it with an "Address X" label, to keep the data confidential.
    // The "user." prefix is only written when a full record follows it, so
    // that an unrecognised mode cannot leave a dangling fragment which
    // would run into the next line of the file
    // FIXME: the users reporting should also have a floor value
    while (keys.hasNext())
    {
        String key = (String) keys.next();
        if (showAddresses)
        {
            summary.append("user." + key + "=" + userAggregator.get(key) + "\n");
        }
        else if (aliasAddresses)
        {
            summary.append("user.Address " + Integer.toString(address++) + "=" + userAggregator.get(key) + "\n");
        }
    }

    // FIXME: all values which have floors set should provide an "other"
    // record which counts how many other things which didn't make it into
    // the listing there are

    // output the search word information
    summary.append("search_floor=" + searchFloor + "\n");
    keys = searchAggregator.keySet().iterator();
    while (keys.hasNext())
    {
        String key = (String) keys.next();
        if (((Integer) searchAggregator.get(key)).intValue() >= searchFloor)
        {
            summary.append("search." + key + "=" + searchAggregator.get(key) + "\n");
        }
    }

    // FIXME: we should do a lot more with the search aggregator
    // Possible feature list:
    //  - constrain by collection/community perhaps?
    //  - we should consider building our own aggregator class which can
    //      be full of rich data.  Perhaps this and the Stats class should
    //      be the same thing.

    // item viewing information
    summary.append("item_floor=" + itemFloor + "\n");
    summary.append("host_url=" + url + "\n");
    summary.append("item_lookup=" + itemLookup + "\n");

    // write out the item access information
    keys = itemAggregator.keySet().iterator();
    while (keys.hasNext())
    {
        String key = (String) keys.next();
        if (((Integer) itemAggregator.get(key)).intValue() >= itemFloor)
        {
            summary.append("item." + key + "=" + itemAggregator.get(key) + "\n");
        }
    }

    // output the average views per item
    if (views > 0)
    {
        summary.append("avg_item_views=" + views + "\n");
    }

    // insert the analysis processing time information (whole seconds)
    Calendar endTime = new GregorianCalendar();
    long timeInMillis = (endTime.getTimeInMillis() - startTime.getTimeInMillis());
    summary.append("analysis_process_time=" + Long.toString(timeInMillis / 1000) + "\n");

    // finally write the string into the output file
    BufferedWriter out = null;
    try
    {
        out = new BufferedWriter(new FileWriter(outFile));
        out.write(summary.toString());
        out.flush();
        out.close();
        out = null;
    }
    catch (IOException e)
    {
        System.out.println("Unable to write to output file " + outFile);
        System.exit(0);
    }
    finally
    {
        // only reached with an open writer if something unexpected was
        // thrown above; release the file handle on a best-effort basis
        if (out != null)
        {
            try
            {
                out.close();
            }
            catch (IOException ignored)
            {
                // nothing further can be done for a writer that is
                // already being abandoned
            }
        }
    }
}
/**
 * get an array of file objects representing the contents of the passed
 * log directory
 *
 * If the passed path is not a directory a message is printed and the
 * process exits.
 *
 * @param logDir the log directory in which to pick up files
 *
 * @return an array of file objects for the entries of the given logDir;
 *         never null, but empty if the directory could not be listed
 */
public static File[] getLogFiles(String logDir)
{
    // open the log files directory
    File logs = new File(logDir);

    // if log dir is not a directory report an error and exit
    if (!logs.isDirectory())
    {
        System.out.println("Passed log directory is not a directory");
        System.exit(0);
    }

    // get the files in the directory.  listFiles() returns null if an I/O
    // error occurs while listing, so normalise that to an empty array to
    // spare callers a NullPointerException
    File[] entries = logs.listFiles();
    if (entries == null)
    {
        return new File[0];
    }
    return entries;
}
/**
 * set up the regular expressions to be used by this analyser.  Mostly this
 * exists to provide a degree of segregation and readability to the code
 * and to ensure that you only need to set up the regular expressions to
 * be used once
 *
 * The exclude character, exclude type and exclude word patterns are built
 * from the configured lists.  When one of those lists is empty the
 * corresponding pattern is set to one which can never match, because the
 * naively built alternatives would be "[]" (which does not compile) or
 * "()" (which matches the empty string at every position and would so
 * cause a space to be inserted between every character of a query).
 *
 * @param fileTemplate the regex to be used to identify dspace log files
 */
public static void setRegex(String fileTemplate)
{
    // an empty negative lookahead fails at every position
    Pattern matchNothing = Pattern.compile("(?!)");

    // build the exclude characters regular expression: a character class
    // containing each configured entry, backslash escaped
    if (excludeChars.isEmpty())
    {
        excludeCharRX = matchNothing;
    }
    else
    {
        StringBuffer charRegEx = new StringBuffer();
        charRegEx.append("[");
        for (int i = 0; i < excludeChars.size(); i++)
        {
            charRegEx.append("\\" + (String) excludeChars.get(i));
        }
        charRegEx.append("]");
        excludeCharRX = Pattern.compile(charRegEx.toString());
    }

    // regular expression to find handle indicators in strings
    handleRX = Pattern.compile("handle=");

    // regular expression to find item_id indicators in strings
    itemRX = Pattern.compile(",item_id=.*$");

    // regular expression to find query indicators in strings
    queryRX = Pattern.compile("query=");

    // regular expression to find collections in strings
    collectionRX = Pattern.compile("collection_id=[0-9]*,");

    // regular expression to find communities in strings
    communityRX = Pattern.compile("community_id=[0-9]*,");

    // regular expression to find search result sets
    resultsRX = Pattern.compile(",results=(.*)");

    // regular expressions to find single characters anywhere in the string
    singleRX = Pattern.compile("( . |^. | .$)");

    // set up the standard log file line regular expressions; the two
    // differ only in the number of colon separated segments after the '@'
    String logLine13 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+):[^:]+:([^:]+):(.*)";
    String logLine14 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+):[^:]+:[^:]+:([^:]+):(.*)";
    valid13 = Pattern.compile(logLine13);
    valid14 = Pattern.compile(logLine14);

    // set up the pattern for validating log file names
    logRegex = Pattern.compile(fileTemplate);

    // set up the pattern for matching any of the query types
    if (excludeTypes.isEmpty())
    {
        typeRX = matchNothing;
    }
    else
    {
        StringBuffer typeRXString = new StringBuffer();
        typeRXString.append("(");
        for (int i = 0; i < excludeTypes.size(); i++)
        {
            if (i > 0)
            {
                typeRXString.append("|");
            }
            typeRXString.append((String) excludeTypes.get(i));
        }
        typeRXString.append(")");
        typeRX = Pattern.compile(typeRXString.toString());
    }

    // set up the pattern for matching any of the words to exclude; each
    // word is matched between spaces, at the start or at the end of the
    // string
    if (excludeWords.isEmpty())
    {
        wordRX = matchNothing;
    }
    else
    {
        StringBuffer wordRXString = new StringBuffer();
        wordRXString.append("(");
        for (int i = 0; i < excludeWords.size(); i++)
        {
            if (i > 0)
            {
                wordRXString.append("|");
            }
            wordRXString.append(" " + (String) excludeWords.get(i) + " ");
            wordRXString.append("|");
            wordRXString.append("^" + (String) excludeWords.get(i) + " ");
            wordRXString.append("|");
            wordRXString.append(" " + (String) excludeWords.get(i) + "$");
        }
        wordRXString.append(")");
        wordRX = Pattern.compile(wordRXString.toString());
    }
}
/**
 * read in the given config file and populate the class globals
 *
 * The aggregators and configuration lists are re-created before reading.
 * Lines starting with '#' are treated as comments; every other line of
 * the form key=value is examined and recognised keys are stored.  If the
 * file cannot be opened a message is printed and the process exits.
 *
 * @param configFile the config file to read in
 *
 * @throws IOException if the file cannot be read once it has been opened
 */
public static void readConfig(String configFile)
    throws IOException
{
    //instantiate aggregators
    actionAggregator = new HashMap();
    searchAggregator = new HashMap();
    userAggregator = new HashMap();
    itemAggregator = new HashMap();
    archiveStats = new HashMap();

    //instantiate lists
    generalSummary = new ArrayList();
    excludeWords = new ArrayList();
    excludeTypes = new ArrayList();
    excludeChars = new ArrayList();
    itemTypes = new ArrayList();

    // prepare our standard file readers and buffered readers
    FileReader fr = null;
    BufferedReader br = null;

    try
    {
        fr = new FileReader(configFile);
        br = new BufferedReader(fr);
    }
    catch (IOException e)
    {
        System.out.println("Failed to read config file: " + configFile);
        System.exit(0);
    }

    try
    {
        // read in the config file and set up our class variables
        String record = null;
        while ((record = br.readLine()) != null)
        {
            // check to see what kind of line we have.  The comment pattern
            // is anchored to the start of the line, so find() is used:
            // matches() would only recognise a line consisting solely of '#'
            Matcher matchComment = comment.matcher(record);
            Matcher matchReal = real.matcher(record);

            // if the line is a comment or is not real, skip it
            if (matchComment.find() || !matchReal.matches())
            {
                continue;
            }

            // lift the values out of the matcher's result groups
            String key = matchReal.group(1).trim();
            String value = matchReal.group(2).trim();

            // read the config values into our class variables (see
            // documentation for more info on config params)
            if (key.equals("general.summary"))
            {
                // seed the action with a zero count so that it is reported
                // even if it never occurs in the logs
                actionAggregator.put(value, new Integer(0));
                generalSummary.add(value);
            }
            else if (key.equals("exclude.word"))
            {
                excludeWords.add(value);
            }
            else if (key.equals("exclude.type"))
            {
                excludeTypes.add(value);
            }
            else if (key.equals("exclude.character"))
            {
                excludeChars.add(value);
            }
            else if (key.equals("item.type"))
            {
                itemTypes.add(value);
            }
            else if (key.equals("item.floor"))
            {
                itemFloor = Integer.parseInt(value);
            }
            else if (key.equals("search.floor"))
            {
                searchFloor = Integer.parseInt(value);
            }
            else if (key.equals("item.lookup"))
            {
                itemLookup = Integer.parseInt(value);
            }
            else if (key.equals("user.email"))
            {
                userEmail = value;
            }
        }
    }
    finally
    {
        // close the inputs even if a line could not be read or parsed
        br.close();
        fr.close();
    }
}
/**
 * work out the next counter value for the given key of the given map.
 * The map itself is not modified; callers store the returned value.
 *
 * @param map the map holding Integer counters
 * @param key the key whose counter is to be advanced
 *
 * @return an Integer one greater than the value currently stored under
 *         the key, or 1 if the map has no entry for the key
 */
public static Integer increment(Map map, String key)
{
    // first sighting of this key: the count starts at one
    if (!map.containsKey(key))
    {
        return new Integer(1);
    }

    Integer current = (Integer) map.get(key);
    return new Integer(current.intValue() + 1);
}
/**
 * Convert a date string of the form YYYY-MM-DD, as requested at the
 * command line, into a Date object.  Prints an error and exits if the
 * date does not parse
 *
 * @param date the string representation of the date
 *
 * @return a date object containing the date, with the time set to
 *         00:00:00
 */
public static Date parseDate(String date)
{
    SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy'-'MM'-'dd");

    try
    {
        return dayFormat.parse(date);
    }
    catch (ParseException e)
    {
        System.out.println("The date is not in the correct format");
        System.exit(0);
    }

    // only reachable if the exit above did not terminate the process
    return null;
}
/**
 * Format the given date object as a string of the form YYYY-MM-DD
 *
 * @param date the date to be converted
 *
 * @return A string of the form YYYY-MM-DD
 */
public static String unParseDate(Date date)
{
    return new SimpleDateFormat("yyyy'-'MM'-'dd").format(date);
}
/**
 * Take a search query string and pull out all of the meaningful information
 * from it, giving the results in the form of a String array, a single word
 * to each element
 *
 * The query is lower cased, then the query/collection/community/results
 * indicators, the excluded search types, characters and words and isolated
 * single characters are each replaced by a space using the pre-defined
 * class level patterns.  What remains is split on whitespace, and any
 * single character tokens which survived the replacement are dropped.
 *
 * @param query the search query to be analysed; null yields an empty array
 *
 * @return the string array containing meaningful search terms
 */
public static String[] analyseQuery(String query)
{
    // nothing to analyse
    if (query == null)
    {
        return new String[0];
    }

    // make the query string totally lower case, to ensure we don't miss out
    // on matches due to capitalisation
    query = query.toLowerCase();

    // now perform successive find and replace operations using pre-defined
    // global regular expressions
    query = queryRX.matcher(query).replaceAll(" ");
    query = collectionRX.matcher(query).replaceAll(" ");
    query = communityRX.matcher(query).replaceAll(" ");
    query = resultsRX.matcher(query).replaceAll(" ");
    query = typeRX.matcher(query).replaceAll(" ");
    query = excludeCharRX.matcher(query).replaceAll(" ");
    query = wordRX.matcher(query).replaceAll(" ");
    query = singleRX.matcher(query).replaceAll(" ");

    // split the remaining string by whitespace.  The single character
    // pattern consumes the spaces around each match, so of two adjacent
    // single characters ("a b") only the first is removed by the
    // replacement above; filter the survivors out here instead
    StringTokenizer st = new StringTokenizer(query);
    List words = new ArrayList();
    while (st.hasMoreTokens())
    {
        String word = st.nextToken();
        if (word.length() > 1)
        {
            words.add(word);
        }
    }

    return (String[]) words.toArray(new String[words.size()]);
}
/**
 * split the given line into its relevant segments if applicable (i.e. the
 * line matches the required regular expression).
 *
 * Lines containing ":ip_addr" after their first character are matched
 * against the version 1.4 pattern, all others against the version 1.3
 * pattern.
 *
 * @param line the line to be segmented
 * @return a Log Line object for the given line, or null if the line does
 *         not match the selected pattern
 */
public static LogLine getLogLine(String line)
{
    // FIXME: consider moving this code into the LogLine class.  To do this
    // we need to much more carefully define the structure and behaviour
    // of the LogLine class

    // choose the line format to validate against
    Pattern format = (line.indexOf(":ip_addr") > 0) ? valid14 : valid13;
    Matcher segments = format.matcher(line);

    if (!segments.matches())
    {
        return null;
    }

    // the first group is the date stamp; the remaining four groups are
    // log fields which are unescaped and trimmed before being handed to
    // the LogLine constructor in capture order
    Date stamp = parseDate(segments.group(1).trim());
    String second = LogManager.unescapeLogField(segments.group(2)).trim();
    String third = LogManager.unescapeLogField(segments.group(3)).trim();
    String fourth = LogManager.unescapeLogField(segments.group(4)).trim();
    String fifth = LogManager.unescapeLogField(segments.group(5)).trim();

    return new LogLine(stamp, second, third, fourth, fifth);
}
/**
 * get the number of items in the archive (in_archive and not withdrawn),
 * optionally restricted to those with the given value for the DC field
 * 'type' (unqualified).  If the class level start and/or end dates are
 * set, only items whose 'date.accessioned' value lies after the start
 * and/or before the end date are counted.
 *
 * @param context the DSpace context for the action
 * @param type value for DC field 'type' (unqualified); null applies no
 *             type restriction
 *
 * @return an integer containing the relevant count
 *
 * @throws SQLException if the count query fails
 */
public static Integer getNumItems(Context context, String type)
    throws SQLException
{
    // the SQL dialect differs between Oracle and the default database
    boolean oracle = "oracle".equals(ConfigurationManager.getProperty("db.name"));

    // FIXME: this method is clearly not optimised

    // FIXME: we don't yet collect total statistics, such as number of items
    // withdrawn, number in process of submission etc.  We should probably do
    // that

    // sub-select for the type constraint
    // NOTE(review): the type value is concatenated into the SQL text as is
    String typeQuery = null;
    if (type != null)
    {
        typeQuery = "SELECT item_id " +
                    "FROM metadatavalue " +
                    "WHERE text_value LIKE '%" + type + "%' " +
                    "AND metadata_field_id = (" +
                    " SELECT metadata_field_id " +
                    " FROM metadatafieldregistry " +
                    " WHERE element = 'type' " +
                    " AND qualifier IS NULL) ";
    }

    // sub-select for the accession date constraint
    StringBuffer dateQuery = new StringBuffer();
    dateQuery.append(oracle ? "SELECT /*+ ORDERED_PREDICATES */ item_id "
                            : "SELECT item_id ");
    dateQuery.append("FROM metadatavalue " +
                     "WHERE metadata_field_id = (" +
                     " SELECT metadata_field_id " +
                     " FROM metadatafieldregistry " +
                     " WHERE element = 'date' " +
                     " AND qualifier = 'accessioned') ");

    // the lower and upper bound clauses differ only in their date and
    // comparison operator, so generate both from one template
    Date[] bounds = new Date[] { startDate, endDate };
    String[] comparisons = new String[] { ">", "<" };
    for (int b = 0; b < bounds.length; b++)
    {
        if (bounds[b] == null)
        {
            continue;
        }

        String day = unParseDate(bounds[b]);
        if (oracle)
        {
            dateQuery.append(" AND TO_TIMESTAMP( TO_CHAR(text_value), " +
                             "'yyyy-mm-dd\"T\"hh24:mi:ss\"Z\"' ) " + comparisons[b] +
                             " TO_DATE('" + day + "', 'yyyy-MM-dd') ");
        }
        else
        {
            dateQuery.append(" AND text_value::timestamp " + comparisons[b] +
                             " '" + day + "'::timestamp ");
        }
    }

    // build the final query
    String archivedLiteral = oracle ? "1 " : "true ";
    String withdrawnLiteral = oracle ? "0 " : "false ";

    StringBuffer query = new StringBuffer();
    query.append("SELECT COUNT(*) AS num " +
                 "FROM item " +
                 "WHERE in_archive = " + archivedLiteral +
                 "AND withdrawn = " + withdrawnLiteral);

    boolean dateConstrained = (startDate != null) || (endDate != null);
    if (dateConstrained)
    {
        query.append(" AND item_id IN ( " +
                     dateQuery.toString() + ") ");
    }

    if (type != null)
    {
        query.append(" AND item_id IN ( " +
                     typeQuery + ") ");
    }

    TableRow row = DatabaseManager.querySingle(context, query.toString());

    if (oracle)
    {
        return new Integer(row.getIntColumn("num"));
    }

    // for some reason the number column is of "long" data type!
    Long count = new Long(row.getLongColumn("num"));
    return new Integer(count.intValue());
}
/**
 * get the number of items in the archive without any restriction on the
 * item type, by delegating to the typed variant with a null type
 *
 * @param context the DSpace context the action is being performed in
 *
 * @return an Integer containing the number of items in the
 *         archive
 *
 * @throws SQLException if the count query fails
 */
public static Integer getNumItems(Context context)
    throws SQLException
{
    // a null type means "do not constrain by type"
    String anyType = null;
    return getNumItems(context, anyType);
}
/**
 * print out the usage information for this class to the standard out
 */
public static void usage()
{
    // one entry per line of help text; each is newline terminated below
    String[] lines = new String[] {
        "Usage Information:",
        "LogAnalyser [options [parameters]]",
        "-log [log directory]",
        "\tOptional",
        "\tSpecify a directory containing log files",
        "\tDefault uses [dspace.dir]/log from dspace.cfg",
        "-file [file name regex]",
        "\tOptional",
        "\tSpecify a regular expression as the file name template.",
        "\tCurrently this needs to be correctly escaped for Java string handling (FIXME)",
        "\tDefault uses dspace.log*",
        "-cfg [config file path]",
        "\tOptional",
        "\tSpecify a config file to be used",
        "\tDefault uses dstat.cfg in dspace config directory",
        "-out [output file path]",
        "\tOptional",
        "\tSpecify an output file to write results into",
        "\tDefault uses dstat.dat in dspace log directory",
        "-start [YYYY-MM-DD]",
        "\tOptional",
        "\tSpecify the start date of the analysis",
        "\tIf a start date is specified then no attempt to gather ",
        "\tcurrent database statistics will be made unless -lookup is",
        "\talso passed",
        "\tDefault is to start from the earliest date records exist for",
        "-end [YYYY-MM-DD]",
        "\tOptional",
        "\tSpecify the end date of the analysis",
        "\tIf an end date is specified then no attempt to gather ",
        "\tcurrent database statistics will be made unless -lookup is",
        "\talso passed",
        "\tDefault is to work up to the last date records exist for",
        "-lookup",
        "\tOptional",
        "\tForce a lookup of the current database statistics",
        "\tOnly needs to be used if date constraints are also in place",
        "-help",
        "\tdisplay this usage information"
    };

    StringBuffer text = new StringBuffer();
    for (int i = 0; i < lines.length; i++)
    {
        text.append(lines[i]).append("\n");
    }

    System.out.println(text.toString());
}
}