/**
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info: http://www.jpedal.org
* (C) Copyright 1997-2008, IDRsolutions and Contributors.
*
* This file is part of JPedal
*
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* ---------------
* PdfGroupingAlgorithms.java
* ---------------
*/
package org.jpedal.grouping;
import java.awt.Point;
import java.awt.Rectangle;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jpedal.PdfDecoder;
import org.jpedal.color.GenericColorSpace;
import org.jpedal.exception.PdfException;
import org.jpedal.objects.PdfData;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Sorts;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.Vector_Float;
import org.jpedal.utils.repositories.Vector_Int;
import org.jpedal.utils.repositories.Vector_Object;
import org.jpedal.utils.repositories.Vector_Rectangle;
import org.jpedal.utils.repositories.Vector_String;
/**
* Applies heuristics to unstructured PDF text to create content
*/
public class PdfGroupingAlgorithms {
/** whether HTML tags are included in teasers; changed through setIncludeHTML(boolean) */
private boolean includeHTMLtags=false;
/**
* selected word detection mode.
* NOTE(review): presumably one of USER_DEFINED_LIST_ONLY / SURROUND_BY_ANY_PUNCTUATION -
* it is not read in the part of the file reviewed here, confirm against the search code
*/
private int wordDetectionTechnique = 0;
/** word detection mode constant (value 0, which is also the initial value of wordDetectionTechnique) */
public static final int USER_DEFINED_LIST_ONLY = 0;
/** word detection mode constant (value 1) */
public static final int SURROUND_BY_ANY_PUNCTUATION = 1;
/** separator string, initialised from the "line.separator" system property and replaceable via setSeparator(String) */
private static String SystemSeparator = System.getProperty("line.separator");
//public PdfGroupingAlgorithms() {}
/** ==============START OF ARRAYS================ */
/**
* content is stored in a set of arrays. We have tried various methods (ie
* create composite object, etc) and none are entirely satisfactory. The
* beauty of this method is speed.
*/
/**
* flag to show this item has been merged into another and should be
* ignored. This allows us to repeat operations on live elements without
* lots of deleting.
*/
private boolean[] isUsed;
/**
* Punctuation characters which may appear directly before or after the
* text and still let it count as a whole word during a search.
* The supported characters are listed one per line below.
*/
private String punctuation = "" +
'\u003A' + // Colon
'\u005F' + // UnderScore
'\u0020' + // Space
'\u0028' + // Open Bracket
'\u0029' + // Close Bracket
'\u0021' + // Exclamation Point
'\u003B' + // Semicolon
'\u002E' + // Full Stop
'\u002C' + // Comma
'\u002F' + // Forward Slash
'\u002D' + // Dash / Minus
'\u003D' + // Equals
'\u002B' + // Plus
'\u003F' + // Question Mark
'\u005B' + // Open Square Brackets
'\u005D' + // Close Square Brackets
'\u007B' + // Open Curly Brackets
'\u007D' + // Close Curly Brackets
'\u0084' + // Double Comma
'\u0085' + // Ellipsis
'\u0093' + // Smart Double Quote Open
'\u0094' + // Smart Double Quote Close
'\u0091' + // Smart Single Quote Open
'\u0092' + // Smart Single Quote Close
'\u201C' + // Left Double Quotation Mark
'\u201D' + // Right Double Quotation Mark
"\u005C\u005C" + // Back Slash
"\u005C\u0027" + // Single Quotation mark
"\u005C\u0022"; // Double Quotation mark
/** co-ords of object (x1,y1 is top left) */
private float[] f_x1, f_x2, f_y1, f_y2;
/**track if we removed space from end*/
private boolean[] hadSpace;
/**hold colour info*/
private String[] f_colorTag;
/**hold writing mode*/
private int[] writingMode;
/**hold move type*/
private int[] moveType;
/** font sizes in pixels */
private int[] fontSize;
/** amount of space a space uses in this font/size */
private float[] spaceWidth;
/** actual text */
private StringBuffer[] content;
/** raw number of text characters */
private int[] textLength;
/** ==============END OF ARRAYS================ */
/**
* handle on page data object. We extract data from this into local arrays
* and return grouped content into object at end. This is done for speed.
*/
private PdfData pdf_data;
PdfPageData pageData;
/** flag to show if output for table is CSV or XHTML */
private boolean isXHTML = true;
/** slot to insert next value - used when we split fragments for table code */
private int nextSlot;
/** vertical breaks for table calculation */
private Vector_Int lineBreaks = new Vector_Int();
/** holds details as we scan lines for table */
private Vector_Object lines;
/** lookup table used to sort into correct order for table */
private Vector_Int lineY2;
/**
* marker char used in content (we bury location for each char so we can
* split)
*/
private static final String MARKER = PdfData.marker;
public static char MARKER2= MARKER.charAt(0);
/** counters for cols and rows and pointer to final object we merge into */
private int max_rows = 0, master = 0;
/**flag to show color info is being extracted*/
private boolean colorExtracted=false;
/** used to calculate correct order for table lines */
private int[] line_order;
/** number of spare slots added to the arrays holding content so there is room when fragments are split */
private final static int increment = 100;
public static boolean useUnrotatedCoords;
/**end points if text located*/
private float[] endPoints;
/**flag to show if a teaser is created on findText*/
private boolean includeTease;
/**teasers for findtext*/
private String[] teasers;
private List multipleTermTeasers = new ArrayList();
private boolean usingMultipleTerms = false;
private boolean isXMLExtraction=true;
/*
* Variables to allow cross line search results
*/
/**Value placed between result areas to show they are part of the same result*/
private int linkedSearchAreas=-101;
/**
 * Creates a grouping object working on the raw text of one page.
 *
 * @param pdf_data raw text fragments; also asked whether colour information was extracted
 * @param pageData page data object, stored for later use
 * @param isXMLExtraction true if content carries XML tags, false for plain text
 */
public PdfGroupingAlgorithms(PdfData pdf_data, PdfPageData pageData, boolean isXMLExtraction) {
    this.isXMLExtraction = isXMLExtraction;
    this.pageData = pageData;
    this.pdf_data = pdf_data;
    // remember whether colour tags are present in the content
    this.colorExtracted = pdf_data.isColorExtracted();
}
/**
 * Replaces the separator string held in SystemSeparator (initially the
 * "line.separator" system property). The field is static, so the new value
 * is shared by all instances.
 *
 * @param sep separator string to use from now on
 */
public static void setSeparator(String sep){
SystemSeparator = sep;
}
/**
 * Chooses the text placed between two fragments when the second one is
 * joined on below the first.
 * <p>
 * A single space is the default. When both lines hold more than one
 * character and one of the last two characters of the first line is a full
 * stop while one of the first two characters of the second line is an upper
 * case letter or '&amp;', a paragraph break is returned instead
 * ("&lt;p&gt;&lt;/p&gt;\n" for XML extraction, "\n" otherwise).
 * <p>
 * The hyphenation test is currently switched off (its character list is
 * empty) and the underline flag is never set, so neither can alter the result.
 *
 * @param rawLine1 upper line, possibly containing XML tags
 * @param rawLine2 lower line, possibly containing XML tags
 * @param isXMLExtraction true if the lines carry XML tags
 * @return separator to insert between the two lines
 */
static final private String getLineDownSeparator(StringBuffer rawLine1, StringBuffer rawLine2, boolean isXMLExtraction) {

    boolean hasUnderline = false;

    //reduce both lines to their text so the characters at the join can be inspected
    final StringBuffer upper = isXMLExtraction ? Strip.stripXML(rawLine1, isXMLExtraction) : Strip.trim(rawLine1);
    final StringBuffer lower = isXMLExtraction ? Strip.stripXML(rawLine2, isXMLExtraction) : Strip.trim(rawLine2);

    final int upperLength = upper.length();
    final int lowerLength = lower.length();

    String separator = " ";

    if (upperLength > 1 && lowerLength > 1) {

        final char upperLast = upper.charAt(upperLength - 1);
        final char upperLastButOne = upper.charAt(upperLength - 2);
        final char lowerFirst = lower.charAt(0);
        final char lowerSecond = lower.charAt(1);

        //hyphenation handling - list is empty so this branch is never taken
        final String hyphen_values = "";

        if (hyphen_values.indexOf(upperLast) != -1) {
            if (upperLast == ' ') {
                separator = " ";
            } else if (upperLastButOne == ':') {
                separator = "\n";
            } else {
                separator = "";
            }
        } else {
            final boolean sentenceEnds = upperLastButOne == '.' || upperLast == '.';
            final boolean sentenceStarts = Character.isUpperCase(lowerFirst)
                    || lowerFirst == '&'
                    || Character.isUpperCase(lowerSecond)
                    || lowerSecond == '&';

            //full stop followed by a capital (or escaped char) is treated as a paragraph break
            if (sentenceEnds && sentenceStarts) {
                separator = isXMLExtraction ? "<p></p>\n" : "\n";
            }
        }
    }

    //extra break after underlined text (flag is never set at present)
    if (hasUnderline) {
        if (isXMLExtraction) {
            separator = separator + "<p></p>\n";
        } else {
            separator = separator + '\n';
        }
    }

    return separator;
}
/**
 * Removes "shadow" fragments (text printed twice on top of itself) and folds
 * "drowned" fragments (a fragment lying wholly inside another) into one.
 * <p>
 * Every live fragment is compared with each live fragment after it. A later
 * fragment with the same font size and a width within 10 of the master's,
 * whose box contains the master's mid point, is flagged as used. Otherwise,
 * if one box lies strictly inside the other, the two are merged with the
 * fragment having the larger f_y2 first, using the separator from
 * getLineDownSeparator.
 *
 * @param avoidSpaces if true, drowned fragments are only merged when the
 * separator chosen for them contains no space
 */
private final void cleanupShadowsAndDrownedObjects(boolean avoidSpaces) {

    final int[] live = getUnusedFragments();
    final int total = live.length;

    for (int outer = 0; outer < total; outer++) {

        final int c = live[outer];

        //may have been merged away on an earlier pass
        if (isUsed[c])
            continue;

        //mid point of master item
        float midX = (f_x1[c] + f_x2[c]) / 2;
        float midY = (f_y1[c] + f_y2[c]) / 2;

        for (int inner = outer + 1; inner < total; inner++) {

            final int n = live[inner];

            //either side may have been consumed by a previous merge
            if (isUsed[n] || isUsed[c])
                continue;

            final boolean sameFontSize = fontSize[n] == fontSize[c];
            final float widthDiff = Math.abs((f_x2[n] - f_x1[n]) - (f_x2[c] - f_x1[c]));

            final boolean isShadow = sameFontSize
                    && midX > f_x1[n] && midX < f_x2[n]
                    && widthDiff < 10
                    && midY < f_y1[n] && midY > f_y2[n];

            if (isShadow) {
                //duplicate of master - just drop it
                isUsed[n] = true;
                continue;
            }

            //drowned text - one box strictly inside the other
            final boolean nInsideC = f_x1[n] > f_x1[c] && f_x2[n] < f_x2[c]
                    && f_y1[n] < f_y1[c] && f_y2[n] > f_y2[c];
            final boolean cInsideN = f_x1[c] > f_x1[n] && f_x2[c] < f_x2[n]
                    && f_y1[c] < f_y1[n] && f_y2[c] > f_y2[n];

            if (!nInsideC && !cInsideN)
                continue;

            //item with the higher bottom edge goes first
            final boolean masterFirst = f_y2[c] > f_y2[n];
            final int first = masterFirst ? c : n;
            final int second = masterFirst ? n : c;

            final String separator = getLineDownSeparator(content[first], content[second], isXMLExtraction);

            if (!avoidSpaces || separator.indexOf(' ') == -1)
                merge(first, second, separator, true);

            //master co-ords may have grown
            midX = (f_x1[c] + f_x2[c]) / 2;
            midY = (f_y1[c] + f_y2[c]) / 2;
        }
    }
}
/**
 * Works out whether the gap between two text fragments should be shown as
 * a space.
 * <p>
 * The gap is divided by the smaller of the two fragments'
 * spaceWidth*fontSize values (scaled by 1/1000) to give a number of spaces;
 * values between 0.51 and 1 are counted as one space, everything else is
 * truncated to an integer.
 *
 * @param c index of first fragment
 * @param l index of second fragment
 * @param actualGap measured gap between the fragments
 * @param addMultiplespaceXMLTag if true, gaps of more than one space are flagged with a SpaceCount tag
 * @param writingMode writing mode of the text - the tag is only added for PdfData.HORIZONTAL_LEFT_TO_RIGHT
 * @return "" for no space, " " for a space, or a space followed by a SpaceCount tag
 */
final private String isGapASpace(int c, int l, float actualGap,boolean addMultiplespaceXMLTag,int writingMode) {

    //size of a space for each fragment - the smaller one is used
    final float spaceC = spaceWidth[c] * fontSize[c];
    final float spaceL = spaceWidth[l] * fontSize[l];
    final float smallerSpace = (spaceC > spaceL) ? spaceL : spaceC;

    float spaces = actualGap / (smallerSpace / 1000);

    //cast below truncates, so lift anything just over half a space to a whole one
    if (spaces > 0.51f && spaces < 1)
        spaces = 1;

    final int spaceCount = (int) spaces;

    if (spaceCount <= 0)
        return "";

    //flag multiple spaces with an XML tag
    if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT)
        return " <SpaceCount space=\"" + spaceCount + "\" />";

    return " ";
}
/**
 * Appends fragment c to fragment m and marks c as used.
 * <p>
 * The bounding box of m is enlarged to include c, m takes the font size of
 * c and the text lengths are added together. For XML extraction the
 * separator can be placed in front of the closing font (and colour) tag of
 * m, one trailing space is removed when c holds more than one character,
 * and a colour/font tag opening c which repeats the last one open in m is
 * removed together with its closing tag in m.
 *
 * @param m index of fragment merged into (kept)
 * @param c index of fragment appended (flagged as used, content set to null)
 * @param separator text placed between the two fragments
 * @param moveFont if true (XML extraction only) the separator goes before the closing tag at the end of m
 */
final private void merge(int m,int c,String separator,boolean moveFont) {

    //stretch box of m so it covers c as well
    if (f_x1[c] < f_x1[m])
        f_x1[m] = f_x1[c];
    if (f_y1[c] > f_y1[m])
        f_y1[m] = f_y1[c];
    if (f_x2[c] > f_x2[m])
        f_x2[m] = f_x2[c];
    if (f_y2[c] < f_y2[m])
        f_y2[m] = f_y2[c];

    if (isXMLExtraction) {

        //closing token(s) expected at end of m
        final String closing = colorExtracted ? Fonts.fe + GenericColorSpace.ce : Fonts.fe;

        final String masterText = moveFont ? content[m].toString() : null;
        final int closingAt = moveFont ? masterText.lastIndexOf(closing) : -1;

        if (closingAt != -1) {
            //slip separator in before the closing token
            content[m] = new StringBuffer(masterText.substring(0, closingAt));
            content[m].append(separator);
            content[m].append(masterText.substring(closingAt));
        } else {
            content[m].append(separator);
        }

        //only map out space if text length is longer than 1
        if (textLength[c] > 1 && content[m].toString().endsWith(" "))
            content[m].deleteCharAt(content[m].length() - 1);

        //font size of merged text is that of the text at its end
        fontSize[m] = fontSize[c];

        //remove excess / redundant xml tags
        dropRepeatedTag(m, c, "<color", "</color>");
        dropRepeatedTag(m, c, "<font", "</font>");

    } else {

        //font size of merged text is that of the text at its end
        fontSize[m] = fontSize[c];

        content[m].append(separator);
    }

    content[m].append(content[c]);

    //track length of text less all tokens
    textLength[m] = textLength[m] + textLength[c];

    //flush c and log it as used
    isUsed[c] = true;
    content[c] = null;
}

/**
 * If both fragments contain the tag and c begins with the same opening tag
 * as the last one in m, and m ends with the matching closing tag, removes
 * the first opening tag from c and the last closing tag from m.
 */
private void dropRepeatedTag(int m, int c, String openPrefix, String closeTag) {

    if (content[c].indexOf(openPrefix) == -1 || content[m].indexOf(openPrefix) == -1)
        return;

    //last opening tag in m, without its '>'
    final int lastOpen = content[m].lastIndexOf(openPrefix);
    final String lastOpenTag = content[m].substring(lastOpen, content[m].indexOf(">", lastOpen));

    final boolean sameTagStartsC = content[c].toString().startsWith(lastOpenTag);

    if (sameTagStartsC
            && content[m].lastIndexOf(closeTag) + closeTag.length() - 1 == content[m].lastIndexOf(">")) {

        content[c].replace(content[c].indexOf(openPrefix), content[c].indexOf(">") + 1, "");

        final int lastClose = content[m].lastIndexOf(closeTag);
        content[m].replace(lastClose, lastClose + closeTag.length(), "");
    }
}
/**
 * Strips the hidden position/width markers from the content of every
 * fragment which is still live.
 */
final private void removeEncoding() {

    final int[] live = getUnusedFragments();

    for (int i = 0; i < live.length; i++) {

        final int slot = live[i];

        //replace content with the version which has no markers
        if (!isUsed[slot])
            content[slot] = removeHiddenMarkers(slot);
    }
}
/**
 * Copies every raw text element held in pdf_data into freshly allocated
 * local arrays (one slot per element) so later merging can work on plain
 * arrays. All fragments start off unused. Also refreshes the
 * colorExtracted flag from pdf_data.
 */
final private void copyToArrays() {

    colorExtracted = pdf_data.isColorExtracted();

    final int total = pdf_data.getRawTextElementCount();

    //text and its properties
    content = new StringBuffer[total];
    textLength = new int[total];
    fontSize = new int[total];
    spaceWidth = new float[total];
    writingMode = new int[total];
    moveType = new int[total];
    f_colorTag = new String[total];

    //position
    f_x1 = new float[total];
    f_x2 = new float[total];
    f_y1 = new float[total];
    f_y2 = new float[total];

    //nothing merged yet
    isUsed = new boolean[total];

    for (int slot = 0; slot < total; slot++) {

        content[slot] = new StringBuffer(pdf_data.contents[slot]);
        textLength[slot] = pdf_data.text_length[slot];
        fontSize[slot] = pdf_data.f_end_font_size[slot];
        spaceWidth[slot] = pdf_data.space_width[slot];
        writingMode[slot] = pdf_data.f_writingMode[slot];
        moveType[slot] = pdf_data.move_command[slot];
        f_colorTag[slot] = pdf_data.colorTag[slot];

        f_x1[slot] = pdf_data.f_x1[slot];
        f_x2[slot] = pdf_data.f_x2[slot];
        f_y1[slot] = pdf_data.f_y1[slot];
        f_y2[slot] = pdf_data.f_y2[slot];
    }
}
/**
 * Builds a list of the fragments not yet flagged in isUsed.
 *
 * @return indices of all unused fragments in ascending order (empty array if there are none)
 */
private int[] getUnusedFragments() {

    final int total = isUsed.length;

    //first pass - how many are still live
    int liveCount = 0;
    for (int i = 0; i < total; i++) {
        if (!isUsed[i])
            liveCount++;
    }

    //second pass - record their positions
    final int[] items = new int[liveCount];
    int next = 0;
    for (int i = 0; i < total && next < liveCount; i++) {
        if (!isUsed[i]) {
            items[next] = i;
            next++;
        }
    }

    return items;
}
/**
 * Returns the content of a fragment without the position data hidden in it.
 * <p>
 * Each encoded piece of text is stored as
 * MARKER position MARKER width MARKER text; only the text is kept.
 * Content holding no MARKER is returned as is (same object).
 *
 * @param c index of fragment
 * @return content of fragment c with all markers, positions and widths removed
 */
private StringBuffer removeHiddenMarkers(int c) {

    final StringBuffer encoded = content[c];

    //nothing to do if there are no markers
    if (encoded.indexOf(MARKER) == -1)
        return encoded;

    //delimiters are returned as tokens so each marker can be recognised
    final StringTokenizer tokens = new StringTokenizer(encoded.toString(), MARKER, true);
    final StringBuffer plain = new StringBuffer();

    while (tokens.hasMoreTokens()) {

        String token = tokens.nextToken();

        if (token.equals(MARKER)) {

            //skip position, second marker, width and third marker
            for (int skipped = 0; skipped < 4; skipped++)
                tokens.nextToken();

            //what follows is the text itself
            token = tokens.nextToken();
        }

        plain.append(token);
    }

    return plain;
}
/**
 * Chooses whether teasers contain HTML tags
 * (ie this is &lt;b&gt;word&lt;/b&gt; rather than this is word).
 *
 * @param value true to include HTML tags in teasers
 */
public void setIncludeHTML(boolean value) {
    this.includeHTMLtags = value;
}
/**
 * Returns the text of an encoded string without the position data hidden in it.
 * <p>
 * Each encoded piece of text is stored as
 * MARKER position MARKER width MARKER text; only the text is kept.
 *
 * @param contents encoded text (may be null)
 * @return null if contents is null, contents itself if it holds no MARKER,
 * otherwise a new string with all markers, positions and widths removed
 */
public static String removeHiddenMarkers(String contents) {

    //nothing to strip
    if (contents == null || !contents.contains(MARKER))
        return contents;

    //delimiters are returned as tokens so each marker can be recognised
    final StringTokenizer tokens = new StringTokenizer(contents, MARKER, true);
    final StringBuilder plain = new StringBuilder();

    while (tokens.hasMoreTokens()) {

        String token = tokens.nextToken();

        if (token.equals(MARKER)) {

            //skip position, second marker, width and third marker
            for (int skipped = 0; skipped < 4; skipped++)
                tokens.nextToken();

            //what follows is the text itself
            token = tokens.nextToken();
        }

        plain.append(token);
    }

    return plain.toString();
}
/**
 * Looks for x positions at which text repeatedly starts, as candidates for
 * vertical breaks between columns.
 * <p>
 * For every raw fragment inside the area (with a tolerance of 0.5) the
 * hidden markers are read. Whenever a character follows a space, or is the
 * first one in its fragment, its x position (as an int) is counted and, if
 * the previous position is not 0, so is the point midway between the two.
 * Every position counted more often than half the highest count is added
 * to lineBreaks.
 * <p>
 * NOTE(review): the co-ordinates are read from the local f_x1/f_x2/f_y1/f_y2
 * arrays while the text comes from pdf_data.contents at the same index -
 * confirm the local arrays are meant to hold the raw values at this point.
 *
 * @param minX left edge of area
 * @param minY bottom edge of area
 * @param maxX right edge of area
 * @param maxY top edge of area
 * @param currentWritingMode one of the PdfData writing mode constants, used to map co-ordinates
 * @throws PdfException if currentWritingMode is not a recognised writing mode
 */
private void findVerticalLines(float minX,float minY,float maxX,float maxY,int currentWritingMode) throws PdfException {

    //number of times each x value has been seen
    final Map<Integer, Integer> xLines = new HashMap<Integer, Integer>();

    //highest count reached so far
    int most_frequent = 0;

    final int count = pdf_data.getRawTextElementCount();

    for (int i = 0; i < count; i++) {

        final String raw = this.pdf_data.contents[i];

        //map co-ords so text can be treated as left to right
        final float x1, x2, y1, y2;
        if (currentWritingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
            x1 = this.f_x1[i];
            x2 = this.f_x2[i];
            y1 = this.f_y1[i];
            y2 = this.f_y2[i];
        } else if (currentWritingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
            x2 = this.f_x1[i];
            x1 = this.f_x2[i];
            y1 = this.f_y1[i];
            y2 = this.f_y2[i];
        } else if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
            x1 = this.f_y1[i];
            x2 = this.f_y2[i];
            y1 = this.f_x2[i];
            y2 = this.f_x1[i];
        } else if (currentWritingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
            x1 = this.f_y2[i];
            x2 = this.f_y1[i];
            y2 = this.f_x1[i];
            y1 = this.f_x2[i];
        } else {
            throw new PdfException("Illegal value "+currentWritingMode+"for currentWritingMode");
        }

        final boolean inArea = (x1 > minX - .5) && (x2 < maxX + .5) && (y2 > minY - .5) && (y1 < maxY + .5);
        if (!inArea)
            continue;

        float currentX = 0, lastX;
        String lastValue = "";

        //walk the markers to pull out the x value of each piece of text
        final StringTokenizer tokens = new StringTokenizer(raw, MARKER, true);

        while (tokens.hasMoreTokens()) {

            String value = tokens.nextToken();

            //only interested in encoded text
            if (!value.equals(MARKER))
                continue;

            //point at which text starts
            value = tokens.nextToken();

            if (value.length() > 0) {

                lastX = currentX;
                currentX = Float.parseFloat(value);

                try {
                    //count at start of fragment or after a space
                    if (lastValue.length() == 0 || lastValue.indexOf(' ') != -1) {

                        most_frequent = tallyX(xLines, (int) currentX, most_frequent);

                        //and the point midway from the previous position
                        final int middle = (int) (lastX + ((currentX - lastX) / 2));
                        if (lastX != 0)
                            most_frequent = tallyX(xLines, middle, most_frequent);
                    }
                } catch (Exception e) {
                    LogWriter.writeLog(
                            "Exception " + e + " stripping x values");
                }
            }

            tokens.nextToken(); //second marker
            tokens.nextToken(); //glyph width
            tokens.nextToken(); //third marker

            //the text itself - remembered for the test on the next piece
            lastValue = tokens.nextToken();
        }
    }

    //keep every position seen more than half as often as the most popular one
    final int minimum_needed = most_frequent / 2;

    for (Map.Entry<Integer, Integer> entry : xLines.entrySet()) {
        if (entry.getValue() > minimum_needed)
            lineBreaks.addElement(entry.getKey());
    }
}

/**
 * Adds one to the count held for x (a first sighting is stored as 1) and
 * returns the highest count known; a first sighting never raises it.
 */
private static int tallyX(Map<Integer, Integer> counts, int x, int mostFrequent) {

    final Integer seen = counts.get(x);

    if (seen == null) {
        counts.put(x, 1);
        return mostFrequent;
    }

    final int reached = seen + 1;
    counts.put(x, reached);

    return (reached > mostFrequent) ? reached : mostFrequent;
}
/**
* Copies the raw fragments which lie in the given area into the local
* arrays, splitting a fragment into several where a break is found.
* <p>
* Each raw fragment holds its text as a run of
* MARKER position MARKER width MARKER text records. The records are read
* one at a time; text whose position is outside the area is dropped and
* the rest is collected until a break is hit, at which point the text
* collected so far is passed to addFragment. Breaks occur
* <ul>
* <li>at a position held in lineBreaks (only if findLines is set, the
* fragment has character spacing and the collected text ends in a space),</li>
* <li>after text ending in punctuation (see checkForPunctuation) or
* containing a space,</li>
* </ul>
* and whatever is left at the end of the fragment is added as a final
* fragment. The local arrays are allocated with <code>increment</code>
* spare slots and isUsed is finally resized to nextSlot entries.
* <p>
* Fix: the tests which clip the start co-ordinate to the first character
* inside the area are now braced per writing mode. Previously each
* <code>else if</code> was attached to the inner co-ordinate test, so the
* branches for right to left and vertical text could never be executed.
*
* @param minX left edge of area
* @param minY bottom edge of area
* @param maxX right edge of area
* @param maxY top edge of area
* @param keepFont passed on to addFragment; with XML extraction also makes sure font tags are opened and closed
* @param breakOnSpace if true, fragments added at a space are flagged in hadSpace
* @param findLines if true, findVerticalLines is run once (using the writing mode of the first accepted fragment) and its breaks are used
* @param punctuation characters treated as punctuation when looking for breaks (null for none)
* @param isWordlist passed on to addFragment; also makes a break skip over any run of spaces
* @throws PdfException if findVerticalLines rejects the writing mode
*/
private void copyToArrays(
float minX,float minY,float maxX,float maxY,
boolean keepFont,boolean breakOnSpace,boolean findLines,String punctuation, boolean isWordlist) throws PdfException {
//switch on to trace the splitting on System.out
final boolean debugSplit=false;
//initialise local arrays allow for extra space
int count = pdf_data.getRawTextElementCount() + increment;
f_x1 = new float[count];
f_colorTag=new String[count];
hadSpace=new boolean[count];
f_x2 = new float[count];
f_y1 = new float[count];
f_y2 = new float[count];
spaceWidth = new float[count];
content = new StringBuffer[count];
fontSize = new int[count];
textLength = new int[count];
writingMode=new int[count];
isUsed=new boolean[count];
moveType=new int[count];
//flag to find lines based on orientation of first text item*/
boolean linesScanned=false;
//set defaults and calculate dynamic values
int text_length;
//back to the number of raw elements for the loop
count = count-increment;
float last_pt,min,max,pt,x1,x2,y1,y2,linePos,character_spacing;
String raw, char_width = "",currentColor;
StringBuffer text = new StringBuffer();
//work through fragments
for (int i = 0; i < count; i++) {
//extract values
character_spacing = pdf_data.f_character_spacing[i];
raw = pdf_data.contents[i];
x1 = pdf_data.f_x1[i];
currentColor=pdf_data.colorTag[i];
x2 = pdf_data.f_x2[i];
y1 = pdf_data.f_y1[i];
y2 = pdf_data.f_y2[i];
text_length = pdf_data.text_length[i];
int mode=pdf_data.f_writingMode[i];
//note - local value hides the moveType array of the class
int moveType=pdf_data.move_command[i];
/**
* see if in area
*/
boolean accepted=false;
if(debugSplit){
System.out.println("raw data="+raw);
System.out.println("text data="+PdfGroupingAlgorithms.removeHiddenMarkers(raw));
}
//if at least partly in the area, process
if ((mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT) &&
y2 > minY && y1 < maxY && x1<maxX && x2>minX){
accepted=true;
}else if((mode==PdfData.VERTICAL_BOTTOM_TO_TOP || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)&&
x1 > minX && x2 < maxX && y1>minY && y2<maxY)
accepted=true;
if(accepted){
/**find lines*/
//look for possible vertical or horizontal lines in the data
if((!linesScanned)&&(findLines)){
findVerticalLines(minX, minY, maxX, maxY,mode);
linesScanned=true;
}
/**
* initialise pointers and work out an
* 'average character space'
**/
if (mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
//space = (x2 - x1) / text_length;
pt = x1;
last_pt = x1;
min=minX;
max=maxX;
}else{ //vertical text
//space = (y1 - y2) / text_length;
pt = y2;
last_pt = y2;
min=minY;
max=maxY;
}
//linePos is only ever given the value -1 in this method so the
//'break on a vertical line' case below is not reached at present
linePos = -1;
/**
* work through text, using embedded markers to work out whether
* each letter is IN or OUT
*/
char[] line=raw.toCharArray();
int end=line.length;
int pointer=0;
String value, textValue = "", pt_reached;
//allow for no tokens and return all text fragment
if (!raw.contains(MARKER))
text = new StringBuffer(raw);
boolean isFirstValue=true, breakPointset=false;
/**
* work through text, using embedded markers to work out whether
* each letter is IN or OUT
*/
while(pointer<end){
//only data between min and y locations
//(inner loop reads records until one lies inside the area or the data runs out)
while (true) {
/**
* read value
*/
if(line[pointer]!=MARKER2){
//plain text with no marker in front - take everything up to next marker
int startPointer=pointer;
while((pointer<end)&&(line[pointer]!=MARKER2))
pointer++;
value = raw.substring(startPointer,pointer);
}else{//if (value.equals(MARKER)) { // read the next token and its location and width
//find first marker
while((pointer<end)&&(line[pointer]!=MARKER2))
pointer++;
pointer++;
//find second marker and get position
int startPointer=pointer;
while((pointer<end)&&(line[pointer]!=MARKER2))
pointer++;
pt_reached = raw.substring(startPointer,pointer);
pointer++;
//find third marker and get width
startPointer=pointer;
while((pointer<end)&&(line[pointer]!=MARKER2))
pointer++;
char_width=raw.substring(startPointer,pointer);
pointer++;
//find next marker - what lies between is the text
startPointer=pointer;
while((pointer<end)&&(line[pointer]!=MARKER2))
pointer++;
value = raw.substring(startPointer,pointer);
textValue = value; //keep value with no spaces
if (pt_reached.length() > 0) { //set point character starts
last_pt = pt;
pt = Float.parseFloat(pt_reached);
//previous break skipped over spaces so new fragment starts at this character
if(breakPointset){
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
x1 = pt;
else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
x2 = pt;
else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
y2=pt;
else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
y1=pt;
breakPointset=false;
}
}
//add font start if needed
if ((isXMLExtraction)&&(last_pt < min)&& (pt > min)&& (!value.startsWith(Fonts.fb)))
value = Fonts.getActiveFontTag(raw, "")+ value;
}
if ((pt > min) & (pt < max)){
//inside the area - if the start co-ord of the fragment is still
//outside, clip it to this character. Braces are needed so each
//else belongs to the test on mode and not to the co-ord test
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
if((x1<min || x1>max) && pt>=min)
x1 = pt;
}else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
if((x2>max || x2<min) && pt<=max)
x2 = pt;
}else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP){
if((y2<min || y2>max) && pt>=min)
y2=pt;
}else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM){
//NOTE(review): pt>min always holds here so pt<=min can never be
//true - possibly meant to be pt<=max as for right to left text, confirm
if((y1<min || y1>max) && pt<=min)
y1=pt;
}
break;
}
//outside the area so drop this piece of text
value = "";
textValue = "";
if(pointer>=end)
break;
}
/**make sure font not sliced off on first value*/
if((isFirstValue)){
isFirstValue=false;
if((isXMLExtraction)&&(keepFont)&&(!value.startsWith(Fonts.fb))&&(!value.startsWith(GenericColorSpace.cb)))//&&(!text.toString().startsWith(Fonts.fb))))
text.append(Fonts.getActiveFontTag(text.toString(), raw));
}
/**
* we now have a valid value inside the selected area so perform tests
*/
//see if a break occurs
boolean is_broken = false;
if(findLines && character_spacing > 0 && text.toString().endsWith(" ")) {
int counts = lineBreaks.size();
for (int jj = 0; jj < counts; jj++) {
int test_x = lineBreaks.elementAt(jj);
//break lies between previous and current character
if ((last_pt < test_x) & (pt > test_x)) {
jj = counts;
is_broken = true;
}
}
}
boolean endsWithPunctuation = checkForPunctuation(textValue,punctuation);
if (is_broken) { //break on double-spaces or larger
if(debugSplit)
System.out.println("Break 1 is_broken");
//fragment being closed ends after the previous character
float Nx1=x1,Nx2=x2,Ny1=y1,Ny2=y2;
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
Nx2 = last_pt + Float.parseFloat(char_width);
else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
Nx1 = last_pt + Float.parseFloat(char_width);
else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
Ny1=last_pt + Float.parseFloat(char_width);
else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
Ny2=last_pt + Float.parseFloat(char_width);
addFragment(moveType,i,text,Nx1,Nx2,Ny1,Ny2,text_length,keepFont,currentColor,isWordlist);
//start next fragment with the active font tag and the current text
text =new StringBuffer(Fonts.getActiveFontTag(text.toString(), raw));
text.append(value);
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
x1 = pt;
else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
x2 = pt;
else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
y2=pt;
else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
y1=pt;
} else if ((endsWithPunctuation)|
((breakOnSpace) && ((textValue.indexOf(' ') != -1)||(value.endsWith(" "))))|((textValue.contains(" ")))) {//break on double-spaces or larger
if(debugSplit)
System.out.println("Break 2 endsWithPunctuation="+endsWithPunctuation+" textValue="+textValue+ '<'+" value="+value+ '<' +" text="+text+ '<');
//Remove final bit of the below if to fix issue in case 11542
if(textValue.length()>1 && textValue.indexOf(' ')!=-1){// && x1==pt){ //add in space values to start of next shape
//count the spaces
int ptr=textValue.indexOf(' ');
//move end point on by the share of the width in front of the space
if(ptr>0){
pt=pt+ ptr*(Float.parseFloat(char_width)/textValue.length());
}
//else
// pt=pt+Float.parseFloat(char_width);
}
//text ending in punctuation is not added to the fragment
if (!endsWithPunctuation)
text.append(value.trim());
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
if(debugSplit)
System.out.println("Add "+x1+ ' ' +pt+" text="+text+" i="+i);
addFragment(moveType,i,text,x1,pt,y1,y2,text_length,keepFont,currentColor,isWordlist);
}else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
if(debugSplit)
System.out.println("b");
addFragment(moveType,i,text,pt,x2,y1,y2,text_length,keepFont,currentColor,isWordlist);
}else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP){
if(debugSplit)
System.out.println("c");
addFragment(moveType,i,text,x1,x2,pt,y2,text_length,keepFont,currentColor,isWordlist);
}else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM) {
if(debugSplit)
System.out.println("d");
addFragment(moveType,i,text,x1,x2,y1,pt,text_length,keepFont,currentColor,isWordlist);
}
if(char_width.length()>0){ //add in space values to start of next shape
//count the spaces
int ptr=0;
if(textValue.indexOf(' ')!=-1)
ptr=textValue.indexOf(' ');
//for a wordlist step over the whole run of spaces
if(isWordlist){
int len=textValue.length();
while(ptr<len && textValue.charAt(ptr)==' '){
ptr++;
}
}
if(ptr>0)
pt=pt+ ptr*Float.parseFloat(char_width);
else
pt=pt+Float.parseFloat(char_width);
//if set, start of next fragment is reset when next position is read
if(ptr>0)
breakPointset=true;
else
breakPointset=false;
}
//store fact it had a space in case we generate wordlist
if((breakOnSpace)&(nextSlot>0))
hadSpace[nextSlot-1]=true;
text =new StringBuffer(Fonts.getActiveFontTag(text.toString(), raw));
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
x1 = pt;// + space;
else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
x2 = pt;// - space;
else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
y2 = pt;// + space;
else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
y1 = pt;// - space;
} else if ((linePos != -1) & (pt > linePos)) {//break on a vertical line
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
addFragment(moveType,i,text,x1,linePos,y1,y2,text_length,keepFont,currentColor,isWordlist);
else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
addFragment(moveType,i,text,linePos,x2,y1,y2,text_length,keepFont,currentColor,isWordlist);
else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
addFragment(moveType,i,text,x1,x2,linePos,y2,text_length,keepFont,currentColor,isWordlist);
else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
addFragment(moveType,i,text,x1,x2,y1,linePos,text_length,keepFont,currentColor,isWordlist);
text =new StringBuffer(Fonts.getActiveFontTag(text.toString(), raw));
text.append(value);
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
x1 = linePos;
else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
x2 = linePos;
else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
y2 = linePos;
else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
y1 = linePos;
linePos = -1;
} else { //allow for space used as tab
//space directly in front of closing font tag - keep only the tag and end fragment at previous character
if ((isXMLExtraction)&&(value.endsWith(' ' +Fonts.fe))) {
value = Fonts.fe;
textValue = "";
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
x2 = last_pt;
else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
x1=last_pt;
else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
y1 = last_pt;
else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
y2 = last_pt;
}
//no break so just carry on collecting text
text.append(value);
}
}
//trap scenario we found if all goes through with no break at end
if((keepFont)&&(isXMLExtraction)&&
(!text.toString().endsWith(Fonts.fe))&&
(!text.toString().endsWith(GenericColorSpace.ce)))
text.append(Fonts.fe);
//create new line with what is left and output
//(only if the co-ords left over still describe a real area)
if (mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
if (x1 < x2)
addFragment(moveType,i,text,x1,x2,y1,y2,text_length,keepFont,currentColor,isWordlist);
}else if (mode==PdfData.VERTICAL_BOTTOM_TO_TOP || mode==PdfData.VERTICAL_TOP_TO_BOTTOM){
if(y1 > y2)
addFragment(moveType,i,text,x1,x2,y1,y2,text_length,keepFont,currentColor,isWordlist);
}
text = new StringBuffer();
}
}
//local lists for faster access
//(one flag for each slot filled so far, all unused)
isUsed = new boolean[nextSlot];
}
/**
 * Reports whether the visible text of a fragment finishes with punctuation.
 *
 * Trailing spaces and trailing &lt;...&gt; tags are skipped before the last
 * character is examined. A closing ';' is handled on its own: it counts as
 * punctuation unless a '&amp;' or '#' is met while walking back over the
 * letters/digits in front of it (i.e. it looks like the end of an entity).
 *
 * @param textValue text to examine (may contain tags)
 * @param punctuation set of characters treated as punctuation, or null
 * @return true if the text ends with punctuation, false otherwise or if
 * punctuation is null or the text is empty
 */
private static boolean checkForPunctuation(String textValue,String punctuation) {

    if (punctuation == null || textValue.length() == 0) {
        return false;
    }

    //walk back from the end over spaces and tags
    int pos = textValue.length() - 1;
    char current = textValue.charAt(pos);
    boolean insideTag = current == '>';

    while (pos > 0 && (insideTag || current == ' ')) {
        if (current == '<') {
            insideTag = false;
        }
        pos--;
        current = textValue.charAt(pos);
        if (current == '>') {
            insideTag = true;
        }
    }

    //ordinary character - just look it up
    if (current != ';') {
        return punctuation.indexOf(current) != -1;
    }

    //semicolon - reject it if it closes something that looks like &xxx;
    for (int scan = pos - 1; scan > -1; scan--) {
        char previous = textValue.charAt(scan);
        if (previous == '&' || previous == '#') {
            return false;
        }
        if (scan == 0 || previous == ' ' || !Character.isLetterOrDigit(previous)) {
            break;
        }
    }
    return true;
}
/**
 * Adds one text fragment to the parallel fragment arrays (f_x1, f_x2,
 * f_y1, f_y2, content, ...), growing them by <code>increment</code> slots
 * when they are full. Fragments for which getFirstChar() returns -1 are
 * ignored.
 *
 * @param moveType value stored in this.moveType for the new fragment
 * @param index index into pdf_data used to read the font size, writing
 * mode and space width of the fragment
 * @param contentss text of the fragment. NOTE: when font tokens are kept
 * in XML mode the closing tags are appended to this buffer itself and the
 * same object is stored, so the caller must not reuse it afterwards
 * @param x1 first x co-ordinate of the fragment
 * @param x2 second x co-ordinate of the fragment
 * @param y1 first y co-ordinate of the fragment
 * @param y2 second y co-ordinate of the fragment
 * @param text_len value stored in textLength for the new fragment
 * @param keepFontTokens if false all tags are stripped via Strip.stripXML
 * @param currentColorTag value stored in f_colorTag for the new fragment
 * @param isWordlist if true ampersand-hash sequences and angle brackets
 * are stripped from the text first
 */
private void addFragment(
        int moveType,
        int index,
        StringBuffer contentss,
        float x1,
        float x2,
        float y1,
        float y2,
        int text_len,
        boolean keepFontTokens,String currentColorTag,boolean isWordlist) {

    StringBuffer current_text = contentss;
    String str = current_text.toString();

    //strip <> or ascii equivalents
    if (isWordlist) {
        if (str.contains("&#"))
            current_text = Strip.stripAmpHash(current_text);

        if ((isXMLExtraction) && ((str.contains("<")) || (str.contains(">"))))
            current_text = Strip.stripXMLArrows(current_text, true);
        else if ((!isXMLExtraction) && ((str.indexOf('<') != -1) || (str.indexOf('>') != -1)))
            current_text = Strip.stripArrows(current_text);
    }

    //ignore blank space objects
    if (getFirstChar(current_text) == -1)
        return;

    if (!keepFontTokens) {
        //strip fonts if required
        current_text = Strip.stripXML(current_text, isXMLExtraction);
    } else if (isXMLExtraction) {
        //make sure the fragment is closed: </font></color> when colour is
        //extracted, otherwise just </font>
        if (pdf_data.isColorExtracted() && (!current_text.toString().endsWith(GenericColorSpace.ce))) {
            if (!current_text.toString().endsWith(Fonts.fe))
                current_text.append(Fonts.fe);
            current_text.append(GenericColorSpace.ce);
        } else if ((!pdf_data.isColorExtracted()) && (!current_text.toString().endsWith(Fonts.fe))) {
            current_text.append(Fonts.fe);
        }
    }

    //make room if every slot is taken
    if (nextSlot >= f_x1.length)
        growFragmentStore(f_x1.length + increment);

    //single place where a fragment is written to the store
    f_x1[nextSlot] = x1;
    f_colorTag[nextSlot] = currentColorTag;
    f_x2[nextSlot] = x2;
    f_y1[nextSlot] = y1;
    f_y2[nextSlot] = y2;
    this.moveType[nextSlot] = moveType;
    fontSize[nextSlot] = pdf_data.f_end_font_size[index];
    writingMode[nextSlot] = pdf_data.f_writingMode[index];
    textLength[nextSlot] = text_len;
    spaceWidth[nextSlot] = pdf_data.space_width[index];
    content[nextSlot] = current_text;

    nextSlot++;
}

/**
 * Resizes every parallel fragment array to newSize, keeping the existing
 * values (new slots hold the type's default value).
 */
private void growFragmentStore(int newSize) {
    f_x1 = Arrays.copyOf(f_x1, newSize);
    f_x2 = Arrays.copyOf(f_x2, newSize);
    f_y1 = Arrays.copyOf(f_y1, newSize);
    f_y2 = Arrays.copyOf(f_y2, newSize);
    f_colorTag = Arrays.copyOf(f_colorTag, newSize);
    hadSpace = Arrays.copyOf(hadSpace, newSize);
    isUsed = Arrays.copyOf(isUsed, newSize);
    spaceWidth = Arrays.copyOf(spaceWidth, newSize);
    content = Arrays.copyOf(content, newSize);
    fontSize = Arrays.copyOf(fontSize, newSize);
    textLength = Arrays.copyOf(textLength, newSize);
    writingMode = Arrays.copyOf(writingMode, newSize);
    this.moveType = Arrays.copyOf(this.moveType, newSize);
}
//////////////////////////////////////////////////////////////////////
/**
 * Rolls the first fragment of every table row (taken in line_order) into a
 * single master fragment and, for XHTML output, wraps the result in
 * TABLE/tr markup.
 *
 * @param border_width border attribute for the TABLE tag; 0 means the
 * attribute is left out
 */
private void mergeTableRows(int border_width) {

    //rows are split by row tags in XHTML and by a newline otherwise
    final String rowSeparator = isXHTML ? "</tr>\n<tr>" : "\n";

    master = ((Vector_Int) lines.elementAt(line_order[0])).elementAt(0);

    for (int row = 1; row < max_rows; row++) {
        int rowHead = ((Vector_Int) lines.elementAt(line_order[row])).elementAt(0);

        if (content[master] == null) {
            //nothing collected yet - start from this row instead
            master = rowHead;
        } else if (content[rowHead] != null) {
            merge(master, rowHead, rowSeparator, false);
        }
    }

    if (!isXHTML) {
        return;
    }

    //opening marker
    if (border_width == 0) {
        content[master].insert(0, "<TABLE>\n<tr>");
    } else {
        StringBuffer withBorder = new StringBuffer("<TABLE border='");
        withBorder.append(String.valueOf(border_width));
        withBorder.append("'>\n<tr>");
        withBorder.append(content[master]);
        content[master] = withBorder;
    }

    //closing marker
    content[master].append("</tr>\n</TABLE>\n");
}
//////////////////////////////////////////////////
/**
 * Collects the indices of all fragments not flagged in isUsed and returns
 * them ordered by Sorts.quicksort.
 *
 * @param sortOnX if true the primary key is x1 and the secondary key y1
 * @param use_y1 only read when sortOnX is false: primary key is y1 if
 * true, y2 if false (secondary key is x1 in both cases)
 * @return indices of the unused fragments as returned by Sorts.quicksort
 */
final private int[] getsortedUnusedFragments(
        boolean sortOnX,
        boolean use_y1) {

    //first pass - how many fragments are still free
    int unusedCount = 0;
    for (boolean used : isUsed) {
        if (!used) {
            unusedCount++;
        }
    }

    int[] ids = new int[unusedCount];
    int[] xKeys = new int[unusedCount];
    int[] y1Keys = new int[unusedCount];
    int[] y2Keys = new int[unusedCount];

    //second pass - record index and integer co-ords used as sort keys
    int slot = 0;
    for (int fragment = 0; fragment < isUsed.length; fragment++) {
        if (isUsed[fragment]) {
            continue;
        }
        ids[slot] = fragment;
        xKeys[slot] = (int) f_x1[fragment];
        y1Keys[slot] = (int) f_y1[fragment];
        y2Keys[slot] = (int) f_y2[fragment];
        slot++;
    }

    if (sortOnX) {
        return Sorts.quicksort(xKeys, y1Keys, ids);
    }

    int[] primaryKeys = use_y1 ? y1Keys : y2Keys;
    return Sorts.quicksort(primaryKeys, xKeys, ids);
}
//////////////////////////////////////////////////////////////////////
/**
* Turns the rows held in <code>lines</code> into table rows.
*
* Works in two phases. First the columns are discovered one at a time:
* for each column the left edge, right edge, width and alignment are
* worked out from the next unplaced item on every row, and each row gets
* either that item or a -1 placeholder for the column. Then every row is
* written out, either as XHTML td cells or as quoted CSV values, and the
* result replaces the content of the row's first fragment (master).
*
* @param keep_alignment_information if true an align attribute (and any
* colspan attribute) is written on each XHTML cell
* @param keep_width_information if true a width attribute is written on
* each XHTML cell
* @param currentWritingMode one of the PdfData writing mode constants
* @throws PdfException if currentWritingMode is not one of the four
* PdfData writing modes handled below
*/
private void createTableRows(
boolean keep_alignment_information,
boolean keep_width_information,int currentWritingMode) throws PdfException {
/**
* local references to the co-ordinate arrays (these are aliases of the
* instance arrays, not copies)
*/
float[] f_x1,f_x2;
/**
* set pointers so left to right text
*/
if(currentWritingMode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
f_x1=this.f_x1;
f_x2=this.f_x2;
//f_y1=this.f_y1;
//f_y2=this.f_y2;
}else if(currentWritingMode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
f_x2=this.f_x1;
f_x1=this.f_x2;
//f_y1=this.f_y1;
//f_y2=this.f_y2;
}else if(currentWritingMode==PdfData.VERTICAL_BOTTOM_TO_TOP){
f_x1=this.f_y2;
f_x2=this.f_y1;
//f_y1=this.f_x2;
//f_y2=this.f_x1;
}else if(currentWritingMode==PdfData.VERTICAL_TOP_TO_BOTTOM){
f_x1=this.f_y1;
f_x2=this.f_y2;
//f_y2=this.f_x1;
//f_y1=this.f_x2;
/**
* fiddle x,y co-ords so it works
*
* NOTE(review): f_x1/f_x2 alias this.f_y1/this.f_y2 here, so the
* 'turn around' loop below rewrites the instance arrays in place
*/
//get max size
int maxX=0;
for (float aF_x1 : f_x1) {
if (maxX < aF_x1)
maxX = (int) aF_x1;
}
maxX++; //allow for fp error
//turn around
for(int ii=0;ii<f_x2.length;ii++){
f_x1[ii]=maxX-f_x1[ii];
f_x2[ii]=maxX-f_x2[ii];
}
}else{
throw new PdfException("Illegal value "+currentWritingMode+"for currentWritingMode");
}
int item, i, current_col = -1;
int itemsInTable = 0, items_added = 0;
//pointer to current element on each row
int[] currentItem = new int[max_rows];
//per row: item index placed in each column, or -1 for an empty cell
Vector_Int[] rowContents = new Vector_Int[max_rows];
Vector_String alignments = new Vector_String(); //text alignment
Vector_Float widths = new Vector_Float(); //cell widths
Vector_Float cell_x1 = new Vector_Float(); //left edge of each column
//separator passed to merge() and text used for an empty XHTML cell
String separator = "", empty_cell = " ";
if (isXHTML == false) {
separator = "\",\"";
empty_cell = "";
}
/**
* set number of items on each line, column count and populate empty rows
*/
int[] itemCount = new int[max_rows];
for (i = 0; i < max_rows; i++) {
//presumably Vector_Int.size() is one more than the number of stored items - TODO confirm
itemCount[i] = ((Vector_Int) lines.elementAt(i)).size() - 1;
//total number of items
itemsInTable = itemsInTable + itemCount[i];
//reset other values
currentItem[i] = 0;
rowContents[i] = new Vector_Int(20);
}
//now work through and split any overlapping items until all done
//(one pass of this loop per column)
while (true) {
//size of column and pointers
float x1 = 9999,min_x2 = 9999,x2,current_x1,current_x2,c_x1,next_x1 = 9999,c_x2,items_in_column = 0;
current_col++;
boolean all_done = true; //flag to exit at end
float total_x1 = 0, total_x2 = 0, left_gap = 0, right_gap;
String alignment = "center";
if (items_added < itemsInTable) {
/**
* work out cell x boundaries on basis of objects
*/
for (i = 0; i < max_rows; i++) { //get width for column
if (itemCount[i] > currentItem[i]) { //item id
item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
current_x1 = f_x1[item];
current_x2 = f_x2[item];
if (current_x1 < x1) //left margin
x1 = current_x1;
if (current_x2 < min_x2) //right margin if appropriate
min_x2 = current_x2;
}
}
cell_x1.addElement(x1); //save left margin
x2 = min_x2; //set default right margin
/**
* workout end and next column start by scanning all items
*/
for (i = 0;i < max_rows;i++) { //slot the next item on each row together work out item
//NOTE(review): unlike the loop above this reads the element at currentItem[i]
//without first checking itemCount[i] > currentItem[i] - confirm that
//Vector_Int tolerates the read for rows that have no items left
item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
c_x1 = f_x1[item];
c_x2 = f_x2[item];
//max item width of this column
if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 > x2))
x2 = c_x2;
if (currentItem[i] < itemCount[i]) { //next left margin
item =((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i] + 1);
current_x1 = f_x1[item];
if ((current_x1 > min_x2) & (current_x1 < next_x1))
next_x1 = current_x1;
}
}
//stop infinite loop case
if(x1==x2)
break;
//allow for last column
if (next_x1 == 9999)
next_x1 = x2;
/**
* count items in table and workout raw totals for alignment.
* Also work out widest x2 in column
*/
for (i = 0;i < max_rows;i++) { //slot the next item on each row together
//work out item
item =((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
c_x1 = f_x1[item];
c_x2 = f_x2[item];
//use items in first column of single colspan
if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 <= next_x1)) {
//running totals to calculate alignment
total_x1 = total_x1 + c_x1;
total_x2 = total_x2 + c_x2;
items_in_column++;
}
}
/**
* work out gap and include empty space between cols and save
*/
//NOTE(review): i equals max_rows after the loop above, so this test only
//succeeds when max_rows is 0; possibly current_col was intended - confirm
if (i == 0)
left_gap = x1;
//NOTE(review): next_x1 starts at 9999 and is only ever given co-ordinate
//values above, so -1 looks unreachable unless a co-ordinate is -1 - confirm
if (next_x1 == -1)
right_gap = 0;
else
right_gap = (int) ((next_x1 - x2) / 2);
int width = (int) (x2 - x1 + right_gap + left_gap);
//noinspection UnusedAssignment,UnusedAssignment
left_gap = right_gap;
widths.addElement(width);
/** workout the alignment */
//items_in_column is a float, so a count of 0 gives NaN/Infinity rather than
//an exception; both comparisons are then false and alignment stays "center"
float x1_diff = (total_x1 / items_in_column) - x1;
float x2_diff = x2 - (total_x2 / items_in_column);
if (x1_diff < 1)
alignment = "left";
else if (x2_diff < 1)
alignment = "right";
alignments.addElement(alignment);
for (i = 0;i < max_rows;i++) { //slot the next item on each row together
master = ((Vector_Int) lines.elementAt(i)).elementAt(0);
//get next item on line or -1 for no more
if (itemCount[i] > currentItem[i]) {
//work out item
item =((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
c_x1 = f_x1[item];
c_x2 = f_x2[item];
all_done = false;
} else {
item = -1;
c_x1 = -1;
c_x2 = -1;
}
if ((item == -1) & (items_added <= itemsInTable)) {
//all items in table so just filling in gaps
rowContents[i].addElement(-1);
} else if ((c_x1 >= x1) & (c_x1 < x2)) {
//fits into cell so add in and roll on marker
rowContents[i].addElement(item);
currentItem[i]++;
items_added++;
} else if (c_x1 > x2) { //empty cell
rowContents[i].addElement(-1);
}
}
}
if (all_done)
break;
}
//===================================================================
/**
* now assemble rows
*/
for (int row = 0; row < max_rows; row++) {
StringBuffer line_content = new StringBuffer();
int count = rowContents[row].size() - 1;
master = ((Vector_Int) lines.elementAt(row)).elementAt(0);
for (i = 0; i < count; i++) {
item = rowContents[row].elementAt(i);
if (isXHTML) {
//get width
float current_width = widths.elementAt(i);
String current_alignment = alignments.elementAt(i);
int test, colspan = 1, pointer = i + 1;
if (item != -1) {
//look for colspan - each following empty cell is removed from the row
//and its width added to this cell
while (true) {
test = rowContents[row].elementAt(i + 1);
if ((test != -1) | (count == i + 1))
break;
//break if over another col - roll up single value on line
if ((itemCount[row] > 1)& (cell_x1.elementAt(i + 1) > f_x2[item]))
break;
count--;
rowContents[row].removeElementAt(i + 1);
colspan++;
//update width
current_width =current_width + widths.elementAt(pointer);
pointer++;
}
}
line_content.append("<td");
//note the colspan attribute is only written when alignment is kept
if (keep_alignment_information) {
line_content.append(" align='");
line_content.append(current_alignment);
line_content.append('\'');
if (colspan > 1)
line_content.append(" colspan='").append(colspan).append('\'');
}
if (keep_width_information)
line_content.append(" width='").append((int) current_width).append('\'');
line_content.append(" nowrap>");
if (item == -1)
line_content.append(empty_cell);
else
line_content.append(content[item]);
line_content.append("</td>");
} else { //csv
if (item == -1) //empty col
line_content.append("\"\",");
else{ //value
line_content.append('\"');
line_content.append(content[item]);
line_content.append("\",");
}
}
//merge to update other values
if ((item != -1) && (master != item)) //merge tracks the shape
merge(master,item,separator,false);
}
//substitute our 'hand coded' value
content[master] = line_content;
}
}
/**
 * Groups the unused fragments into table rows. Starting from each unused
 * fragment of the requested writing mode, the closest fragment further
 * along the same line is repeatedly picked; it is either merged into the
 * previous cell (when the gap is smaller than 1.5 average characters of
 * both) or added to the row as a new cell. Each finished row is appended
 * to <code>lines</code>, its y2 to <code>lineY2</code>, and max_rows is
 * incremented.
 *
 * @param itemCount number of entries of items to process
 * @param items fragment indices to group
 * @param addSpaceXMLTag passed through to isGapASpace when cells merge
 * @param mode writing mode of the table (PdfData constant); fragments
 * with another writing mode are ignored
 * @throws PdfException if mode is not one of the four PdfData writing modes
 */
private void createLinesInTable(int itemCount, int[] items,boolean addSpaceXMLTag,int mode) throws PdfException {

    //reverse order if text right to left
    if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT)
        items = reverse(items);

    //local references arranged so the code below can treat every mode
    //as running along the 'x' axis
    float[] f_x1, f_x2, f_y1, f_y2;

    switch (mode) {
        case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
            f_x1 = this.f_x1;
            f_x2 = this.f_x2;
            f_y1 = this.f_y1;
            f_y2 = this.f_y2;
            break;

        case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
            f_x2 = this.f_x1;
            f_x1 = this.f_x2;
            f_y1 = this.f_y1;
            f_y2 = this.f_y2;
            break;

        case PdfData.VERTICAL_BOTTOM_TO_TOP:
            f_x1 = this.f_y1;
            f_x2 = this.f_y2;
            f_y1 = this.f_x2;
            f_y2 = this.f_x1;
            break;

        case PdfData.VERTICAL_TOP_TO_BOTTOM:
            f_x1 = this.f_y2;
            f_x2 = this.f_y1;
            f_y2 = this.f_x1;
            f_y1 = this.f_x2;
            items = this.getsortedUnusedFragments(false, true);
            items = reverse(items);
            break;

        default:
            throw new PdfException("Illegal value "+mode+"for currentWritingMode");
    }

    //holds line we're working on
    Vector_Int current_line;

    for (int j = 0; j < itemCount; j++) { //for all items

        int c = items[j], id = -1, i, last = c;
        float smallest_gap = -1, gap, yMidPt;

        if (isUsed[c] || this.writingMode[c] != mode)
            continue;

        //start a new row with this element
        current_line = new Vector_Int(20);
        current_line.addElement(c);
        lineY2.addElement((int) f_y2[c]);

        //look for items along same line (already sorted into order)
        while (true) {

            for (int ii = 0; ii < itemCount; ii++) {
                i = items[ii];

                //candidate must start beyond the row start in reading direction
                boolean furtherAlong = (mode != PdfData.VERTICAL_TOP_TO_BOTTOM)
                        ? f_x1[i] > f_x1[c]
                        : f_x1[i] < f_x1[c];

                //the writing mode of the CANDIDATE is tested here (the original
                //re-tested the row start c, which is always true at this point,
                //so fragments of other orientations were pulled into rows)
                if (!isUsed[i] && i != c && writingMode[i] == mode && furtherAlong) {

                    gap = (f_x1[i] - f_x2[c]);

                    if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM)
                        gap = -gap;

                    //allow for fp error
                    if (gap < 0 && gap > -2)
                        gap = 0;

                    //on the same line if its vertical mid point lies inside the row start
                    yMidPt = (f_y1[i] + f_y2[i]) / 2;

                    //see if line & if only or better fit
                    if (yMidPt < f_y1[c] && yMidPt > f_y2[c] && (smallest_gap < 0 || gap < smallest_gap)) {
                        smallest_gap = gap;
                        id = i;
                    }
                }
            }

            if (id == -1) //exit when no more matches
                break;

            //merge in best match if it sits closer to the last cell than 1.5
            //average characters of both, otherwise start a new cell
            float t = f_x1[id] - f_x2[last], possSpace = f_x1[id] - f_x2[c];
            float av_char1 = (float) 1.5 * ((f_x2[id] - f_x1[id]) / textLength[id]);
            float av_char2 = (float) 1.5 * ((f_x2[last] - f_x1[last]) / textLength[last]);

            if (mode == PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
                possSpace = -possSpace;
                t = -t;
                av_char1 = -av_char1;
                av_char2 = -av_char2;
            }

            if (t < av_char1 && t < av_char2) {
                merge(last, id, isGapASpace(id, last, possSpace, addSpaceXMLTag, mode), true);
            } else {
                current_line.addElement(id);
                last = id;
            }

            //flag used and reset variables used
            isUsed[id] = true;
            id = -1;
            smallest_gap = 1000000;
        }

        //add line to list
        lines.addElement(current_line);
        max_rows++;
    }
}
/**
 * Extracts the text inside the given rectangle as an estimated table,
 * either as XHTML or as CSV.
 *
 * XHTML output can optionally keep font tags (keepFontInfo), cell widths
 * (keepWidthInfo), cell alignment (keepAlignmentInfo) and use a table
 * border (borderWidth).
 *
 * Co-ordinates given in the wrong order are swapped (and the swap is
 * logged) rather than rejected.
 *
 * @param x1 is the x coord of the top left corner
 * @param y1 is the y coord of the top left corner
 * @param x2 is the x coord of the bottom right corner
 * @param y2 is the y coord of the bottom right corner
 * @param pageNumber is the page you wish to extract from (not read by this method)
 * @param isCSV if true the text is output as CSV, if false as XHTML
 * @param keepFontInfo if true keeps font information in extracted text
 * @param keepWidthInfo if true and isCSV is false keeps width information in extracted text
 * @param keepAlignmentInfo if true and isCSV is false keeps alignment information in extracted text
 * @param borderWidth is the width of the border for XHTML
 * @return Map with the keys "content", "x1", "x2", "y1" and "y2" (all
 * String values); empty if no text was found in the rectangle
 * @throws PdfException if the table cannot be assembled
 */
public final Map extractTextAsTable(
        int x1,
        int y1,
        int x2,
        int y2,
        int pageNumber,
        boolean isCSV,
        boolean keepFontInfo,
        boolean keepWidthInfo,
        boolean keepAlignmentInfo,
        int borderWidth)
        throws PdfException {

    //put co-ordinates in the expected order
    int[] v = validateCoordinates(x1, y1, x2, y2);
    x1 = v[0];
    y1 = v[1];
    x2 = v[2];
    y2 = v[3];

    Map<String, String> table_content = new Hashtable<String, String>();

    LogWriter.writeLog("extracting Text As Table");

    //flag type of table so we can add correct separators
    isXHTML = !isCSV;

    //init table variables
    lines = new Vector_Object(20);
    lineY2 = new Vector_Int(20);
    max_rows = 0;

    //init store for data (note order of parameters passed)
    copyToArrays(x1, y2, x2, y1, keepFontInfo, false,true,null,false);

    //initial grouping and delete any hidden text
    removeEncoding();

    //eliminate shadows and also merge overlapping text
    cleanupShadowsAndDrownedObjects(false);

    int[] items = this.getsortedUnusedFragments(true, false);
    int item_count = items.length; //number of items

    if (item_count == 0)
        return table_content;

    //check orientation and get preferred. Items not correct will be ignored
    int writingMode = getWritingMode(items, item_count);

    String message = "Table Merging algorithm being applied " + (item_count) + " items";
    LogWriter.writeLog(message);

    if (item_count > 1) {

        //scan all items joining best fit to right of each fragment to build lines
        createLinesInTable(item_count, items, isXHTML, writingMode);

        //generate lookup with lines in correct order (minus used to get
        //correct order down the page)
        int dx = 1;
        if (writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM)
            dx = -1;

        line_order = new int[max_rows];
        int[] line_y = new int[max_rows];

        for (int i = 0; i < max_rows; i++) {
            line_y[i] = dx * lineY2.elementAt(i);
            line_order[i] = i;
        }

        line_order = Sorts.quicksort(line_y, line_order);

        //assemble the columns, then roll the rows into one fragment
        createTableRows(keepAlignmentInfo, keepWidthInfo, writingMode);
        mergeTableRows(borderWidth);
    } else {
        //a single fragment is the whole table - without this master would
        //still hold whatever index an earlier extraction left behind
        master = items[0];
    }

    StringBuffer cleaned = cleanup(content[master]);
    content[master] = cleaned;

    //cleanup returns null for a null buffer, so test before dereferencing
    if (cleaned != null) {
        String processed_value = cleaned.toString();

        //cleanup data if needed by removing duplicate font tokens
        if (!isCSV)
            processed_value = Fonts.cleanupTokens(processed_value);

        table_content.put("content", processed_value);
        table_content.put("x1", String.valueOf(x1));
        table_content.put("x2", String.valueOf(x2));
        table_content.put("y1", String.valueOf(y1));
        table_content.put("y2", String.valueOf(y2));
    }

    return table_content;
}
/**
 * Puts rectangle co-ordinates into the order the extraction code expects
 * (x1 not greater than x2, y1 not less than y2). Values in the wrong
 * order are swapped and the swap is logged; nothing is rejected.
 *
 * @return new array holding {x1, y1, x2, y2} in corrected order
 * @throws PdfException declared for callers; not thrown by the current code
 */
private static int[] validateCoordinates(int x1, int y1, int x2, int y2)
        throws PdfException {

    int left = x1, right = x2;
    if (left > right) {
        left = x2;
        right = x1;
        LogWriter.writeLog("x1 > x2, coordinates were swapped to validate");
    }

    int top = y1, bottom = y2;
    if (top < bottom) {
        top = y2;
        bottom = y1;
        LogWriter.writeLog("y1 < y2, coordinates were swapped to validate");
    }

    return new int[]{left, top, right, bottom};
}
/**
 * Extracts the text inside the given rectangle as a flat list of words
 * and their co-ordinates.
 *
 * Co-ordinates given in the wrong order are swapped (and the swap is
 * logged) rather than rejected.
 *
 * @param x1 is the x coord of the top left corner
 * @param y1 is the y coord of the top left corner
 * @param x2 is the x coord of the bottom right corner
 * @param y2 is the y coord of the bottom right corner
 * @param page_number is the page you wish to extract from (not read by this method)
 * @param breakFragments will divide up text based on white space characters
 * @param punctuation is a string containing all values that should be used to divide up words
 * @return Vector of Strings holding, for every word, the word followed by
 * four co-ordinate values (word, x1, y1, x2, y2, word, ...), or null if no
 * text was found
 * @throws PdfException if the lines cannot be assembled
 */
final public Vector extractTextAsWordlist(
        int x1,
        int y1,
        int x2,
        int y2,
        int page_number,
        boolean breakFragments,
        String punctuation)
        throws PdfException {

    //put co-ordinates in the expected order
    int[] v = validateCoordinates(x1, y1, x2, y2);
    x1 = v[0];
    y1 = v[1];
    x2 = v[2];
    y2 = v[3];

    //extract the raw fragments (note order of parameters passed)
    if (breakFragments)
        copyToArrays(x1, y2, x2, y1, true, true,false,punctuation,true);
    else
        copyToArrays();

    //delete any hidden text
    removeEncoding();

    //eliminate shadows and also merge overlapping text
    cleanupShadowsAndDrownedObjects(true);

    int[] items = getsortedUnusedFragments(true, false);
    int count = items.length;

    //if no values return null
    if (count == 0) {
        LogWriter.writeLog("Less than 1 text item on page");
        return null;
    }

    //check orientation and get preferred. Items not correct will be ignored
    int writingMode = getWritingMode(items, count);

    //build set of lines from text
    createLines(count, items, writingMode, true, false, false);

    //alter co-ords to rotated if requested
    float[] f_x1 = null, f_x2 = null, f_y1 = null, f_y2 = null;

    if (useUnrotatedCoords || writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
        f_x1 = this.f_x1;
        f_x2 = this.f_x2;
        f_y1 = this.f_y1;
        f_y2 = this.f_y2;
    } else if (writingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
        f_x2 = this.f_x1;
        f_x1 = this.f_x2;
        f_y1 = this.f_y1;
        f_y2 = this.f_y2;
    } else if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
        f_x1 = this.f_y2;
        f_x2 = this.f_y1;
        f_y1 = this.f_x2;
        f_y2 = this.f_x1;
    } else if (writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
        f_x1 = this.f_y1;
        f_x2 = this.f_y2;
        f_y2 = this.f_x1;
        f_y1 = this.f_x2;
    }

    //bottom-to-top text reports its y values the other way round; every
    //other case (including top-to-bottom) uses y1 then y2
    final boolean swapY = (!useUnrotatedCoords) && (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP);

    //put into a Vector
    Vector<String> values = new Vector<String>();

    for (int i = 0; i < content.length; i++) {

        if (content[i] == null)
            continue;

        if ((colorExtracted) && (isXMLExtraction)) {
            //wrap in the colour tag recorded for THIS fragment (the original
            //used f_colorTag[master], an index unrelated to the loop)
            if (!content[i].toString().toLowerCase().startsWith(GenericColorSpace.cb)) {
                content[i].insert(0, f_colorTag[i]);
            }
            if (!content[i].toString().toLowerCase().endsWith(GenericColorSpace.ce)) {
                content[i].append(GenericColorSpace.ce);
            }
        }

        if (isXMLExtraction)
            values.add((content[i]).toString());
        else
            values.add(Strip.convertToText((content[i]).toString(), isXMLExtraction));

        values.add(String.valueOf(f_x1[i]));
        values.add(String.valueOf(swapY ? f_y2[i] : f_y1[i]));
        values.add(String.valueOf(f_x2[i]));
        values.add(String.valueOf(swapY ? f_y1[i] : f_y2[i]));
    }

    LogWriter.writeLog("Text extraction as wordlist completed");

    return values;
}
/**
 * Puts the shared extraction state back to its starting values: XHTML
 * output on, colour extraction off, no slots used, no rows, master at 0
 * and an empty list of line breaks.
 */
private void reset(){
    //output flags
    isXHTML = true;
    colorExtracted = false;

    //counters and pointers
    nextSlot = 0;
    max_rows = 0;
    master = 0;

    //collected line breaks
    lineBreaks = new Vector_Int();
}
/**
 * Extracts the text found inside the given rectangle on a page as a
 * single String. In XML extraction mode the result is wrapped in p tags.
 *
 * Co-ordinates given in the wrong order are swapped (and the swap is
 * logged) rather than rejected.
 *
 * @param x1 is the x coord of the top left corner
 * @param y1 is the y coord of the top left corner
 * @param x2 is the x coord of the bottom right corner
 * @param y2 is the y coord of the bottom right corner
 * @param page_number is the page you wish to extract from (not read by this method)
 * @param estimateParagraphs will attempt to find paragraphs and add new lines in output if true
 * @param breakFragments will divide up text based on white space characters if true
 * @return the extracted text, or null if no text was found
 * @throws PdfException if breakFragments is requested but pdf_data reports
 * that widths were not embedded
 */
final public String extractTextInRectangle(
        int x1,
        int y1,
        int x2,
        int y2,
        int page_number,
        boolean estimateParagraphs,
        boolean breakFragments)
        throws PdfException {

    reset();

    if (breakFragments && !pdf_data.IsEmbedded()) {
        throw new PdfException("[PDF] Request to breakfragments and width not added. Please add call to init(true) of PdfDecoder to your code.");
    }

    //put co-ordinates in the expected order
    int[] ordered = validateCoordinates(x1, y1, x2, y2);
    x1 = ordered[0];
    y1 = ordered[1];
    x2 = ordered[2];
    y2 = ordered[3];

    //extract the raw fragments (note order of parameters passed)
    if (breakFragments) {
        copyToArrays(x1, y2, x2, y1, (isXMLExtraction), false,false,null,false);
    } else {
        copyToArrays();
    }

    //delete any hidden text
    removeEncoding();

    //eliminate shadows and also merge overlapping text
    cleanupShadowsAndDrownedObjects(false);

    //get the fragments as an array
    int[] fragments = getsortedUnusedFragments(true, false);
    int fragmentCount = fragments.length;

    //nothing on the page
    if (fragmentCount == 0) {
        LogWriter.writeLog("Less than 1 text item on page");
        return null;
    }

    //check orientation and get preferred. Items not correct will be ignored
    int orientation = getWritingMode(fragments, fragmentCount);

    //build set of lines from text
    createLines(fragmentCount, fragments, orientation, false, isXMLExtraction, false);

    //roll lines together
    int merged = mergeLinesTogether(orientation, estimateParagraphs, x1, x2, y1, y2);

    //add final delimiters
    if (isXMLExtraction) {
        StringBuffer wrapped = new StringBuffer(Fonts.cleanupTokens(content[merged].toString()));
        wrapped.insert(0, "<p>");
        wrapped.append("</p>");
        content[merged] = wrapped;
    }

    LogWriter.writeLog("Text extraction completed");

    return cleanup(content[merged]).toString();
}
/**
 * Makes extracted text safe to embed in XML. Does nothing unless XML
 * extraction is switched on.
 *
 * Two fixes are applied:
 * - a bare ampersand is escaped to &amp;amp;. An ampersand that already
 *   starts a numeric character reference or one of the predefined
 *   entities (lt, gt, amp, quot, apos) is left untouched, so the method
 *   can safely be applied to text that is already escaped;
 * - decimal character references to control characters that XML does not
 *   allow (1-8, 11-12 and 14-31) are removed.
 *
 * The earlier implementation replaced "&amp;" with itself (a no-op) and
 * protected other sequences with XXlt/XXgt/XX# placeholders, which
 * corrupted any text that already contained those placeholder strings.
 *
 * @param buffer text to clean, may be null
 * @return buffer itself when it is null or XML extraction is off,
 * otherwise a new StringBuffer holding the cleaned text
 */
private StringBuffer cleanup(StringBuffer buffer) {

    if (buffer == null || !isXMLExtraction)
        return buffer;

    String buf = buffer.toString();

    //escape ampersands which do not already begin a reference
    buf = buf.replaceAll("&(?!#|(?:lt|gt|amp|quot|apos);)", "&amp;");

    //remove references to restricted control characters; tab (9),
    //line feed (10) and carriage return (13) are legal and kept
    for (int code = 1; code <= 31; code++) {
        boolean legalInXML = (code == 9) || (code == 10) || (code == 13);
        if (!legalInXML)
            buf = buf.replace("&#" + code + ';', "");
    }

    return new StringBuffer(buf);
}
/**
 * Decides which writing mode to use for a set of fragments. The mode of
 * the first item is used unless it is vertical and a later unused item is
 * horizontal, in which case that horizontal mode wins (and a message is
 * logged).
 *
 * @param items fragment indices to inspect; must hold at least one entry
 * @param count number of entries of items to inspect
 * @return the chosen PdfData writing mode value
 */
private int getWritingMode(int[] items, int count) {

    final int firstMode = writingMode[items[0]];

    //nothing to look for if first is already horizontal
    if (firstMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || firstMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
        return firstMode;
    }

    //first is vertical - take the first unused horizontal item if any
    for (int j = 1; j < count; j++) {
        int candidate = items[j];

        if (isUsed[candidate]) {
            continue;
        }

        int candidateMode = writingMode[candidate];
        if (candidateMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT || candidateMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
            LogWriter.writeLog("Text of multiple orientations found. Only horizontal text used.");
            return candidateMode;
        }
    }

    return firstMode;
}
/**
 * Merges all unused fragments (one per line at this stage) into a single
 * fragment, working from the last entry of the sorted list backwards and
 * inserting line or paragraph separators between them.
 *
 * @param currentWritingMode PdfData writing mode of the text
 * @param estimateParagraphs if true a paragraph break is inserted where
 * the merged text ends with '.' or '"' and the next line starts with A-Z
 * @param x1 x co-ordinate of the extraction rectangle
 * @param x2 other x co-ordinate of the extraction rectangle
 * @param y1 y co-ordinate of the extraction rectangle
 * @param y2 other y co-ordinate of the extraction rectangle
 * @return index of the fragment which now holds all the text
 * @throws PdfException if currentWritingMode is not one of the four
 * PdfData writing modes
 */
private int mergeLinesTogether(int currentWritingMode,boolean estimateParagraphs, int x1,int x2,int y1,int y2) throws PdfException {

    //co-ordinate arrays seen along/across the reading direction
    final float[] startEdge, endEdge, upperEdge, lowerEdge;
    int[] order;

    //used for working out alignment
    final int centre;

    switch (currentWritingMode) {
        case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
            startEdge = this.f_x1;
            endEdge = this.f_x2;
            upperEdge = this.f_y1;
            lowerEdge = this.f_y2;
            order = getsortedUnusedFragments(false, true);
            centre = (x1 + x2) / 2;
            break;

        case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
            endEdge = this.f_x1;
            startEdge = this.f_x2;
            upperEdge = this.f_y1;
            lowerEdge = this.f_y2;
            order = getsortedUnusedFragments(false, true);
            centre = (x1 + x2) / 2;
            break;

        case PdfData.VERTICAL_BOTTOM_TO_TOP:
            startEdge = this.f_y1;
            endEdge = this.f_y2;
            upperEdge = this.f_x2;
            lowerEdge = this.f_x1;
            order = reverse(getsortedUnusedFragments(true, true));
            centre = (y1 + y2) / 2;
            break;

        case PdfData.VERTICAL_TOP_TO_BOTTOM:
            startEdge = this.f_y2;
            endEdge = this.f_y1;
            lowerEdge = this.f_x2;
            upperEdge = this.f_x1;
            order = getsortedUnusedFragments(true, true);
            centre = (y1 + y2) / 2;
            break;

        default:
            throw new PdfException("Illegal value "+currentWritingMode+"for currentWritingMode");
    }

    final int quarter = centre / 2;

    //everything is rolled into the last entry of the sorted list
    final int merged = order[order.length - 1];

    final String paragraphBreak = "</p>" + SystemSeparator + "<p>";

    for (int pos = order.length - 2; pos >= 0; pos--) {

        final int child = order[pos];

        //-1 means the line holds no characters - leave it out
        if (getLastChar(content[child]) == -1) {
            continue;
        }

        //add formatting in to retain structure
        addAlignmentFormatting(estimateParagraphs, centre, startEdge, endEdge, quarter, child);

        float gap = lowerEdge[merged] - upperEdge[child];
        float lineHeight = upperEdge[child] - lowerEdge[child];

        if (currentWritingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
            gap = -gap;
            lineHeight = -lineHeight;
        }

        String joiner = "";

        if (lineHeight > 0 && gap > lineHeight) {
            //space of at least a line between the two - add in line gaps
            String blankLine = isXMLExtraction ? SystemSeparator : paragraphBreak;

            while (gap > lineHeight) {
                joiner = joiner + blankLine;
                gap = gap - lineHeight;
            }

            if (isXMLExtraction) {
                joiner = joiner + paragraphBreak;
            } else {
                joiner = SystemSeparator;
            }

        } else if (estimateParagraphs) {
            //break only where a sentence ends and a capital follows
            int childFirst = getFirstChar(content[child]);
            int mergedLast = getLastChar(content[merged]);

            boolean sentenceEnds = (mergedLast == '.') || (mergedLast == '\"');
            boolean capitalFollows = (childFirst >= 'A') && (childFirst <= 'Z');

            if (sentenceEnds && capitalFollows) {
                joiner = isXMLExtraction ? paragraphBreak : SystemSeparator;
            }

        } else if (isXMLExtraction) {
            content[child].insert(0, paragraphBreak);
        } else {
            content[merged].append(SystemSeparator);
        }

        merge(merged, child, joiner, false);
    }

    return merged;
}
/**
 * Scans the buffer from the front for the first character that is neither a
 * space nor inside a &lt;...&gt; tag. When isXMLExtraction is set, &amp;...;
 * sequences are treated as entities: &amp;lt; and &amp;gt; yield '<' and '>',
 * while an ampersand that cannot start an entity is returned as '&amp;'.
 *
 * @param buffer text to scan
 * @return the character found (as an int) or -1 if there is none
 */
private int getFirstChar(StringBuffer buffer) {
    final int length = buffer.length();
    boolean insideMarkup = false;
    char opener = ' ';

    for (int pos = 0; pos < length; pos++) {
        final char current = buffer.charAt(pos);

        if (!insideMarkup && (current == '<' || (isXMLExtraction && current == '&'))) {
            insideMarkup = true;
            opener = current;

            // trap & .... &xx; or other spurious
            if (opener == '&') {
                if (pos + 1 == length) {
                    return '&';
                }
                final char following = buffer.charAt(pos + 1);
                if (following != '#' && following != 'g' && following != 'l') {
                    return '&';
                }
            }
        }

        if (!insideMarkup && current != ' ') {
            return current;
        }

        // allow for valid & in stream
        if (insideMarkup && opener == '&' && current == ' ') {
            return opener;
        }

        if (insideMarkup && (current == '>' || (isXMLExtraction && opener == '&' && current == ';'))) {
            // put back < or >
            if (current == ';' && opener == '&' && pos > 2 && buffer.charAt(pos - 1) == 't') {
                final char entityKind = buffer.charAt(pos - 2);
                if (entityKind == 'l') {
                    return '<';
                }
                if (entityKind == 'g') {
                    return '>';
                }
            }
            insideMarkup = false;
        }
    }

    return -1;
}
/**
 * Scans the buffer backwards for the last character that is neither a space
 * nor part of a closing tag. When isXMLExtraction is set, a trailing
 * &amp;...; sequence is treated as an entity and &amp;lt; / &amp;gt; are
 * reported as '<' / '>'.
 *
 * @param buffer text to scan
 * @return the character found (as an int) or -1 if no match
 */
private int getLastChar(StringBuffer buffer) {
    int i = -1;
    boolean inTag = false;
    int count = buffer.length();
    int size = count;
    char openChar = ' ';
    count--; //knock 1 off so points to last char

    // NOTE: the loop is ended by setting count to -1 rather than by returning;
    // the remaining tests of that iteration still run, so their order matters
    while (count > -1) {
        char nextChar = buffer.charAt(count);

        //trap &xx;;
        if (inTag && openChar == ';' && nextChar == ';') {
            i = ';';
            count = -1;
        }

        if (!inTag && (nextChar == '>' || (isXMLExtraction && nextChar == ';'))) {
            inTag = true;

            //check it is a token and not just > at end
            int lastTokenStart = buffer.lastIndexOf("</"); //find start of this tag if exists
            if (lastTokenStart == -1) { //no tag so ignore
                inTag = false;
            } else { //see if real token by looking for invalid chars inside and reject if found
                char charToTest;
                for (int ptr = lastTokenStart; ptr < count; ptr++) {
                    charToTest = buffer.charAt(ptr);
                    if (charToTest == ' ' || charToTest == '>') {
                        inTag = false;
                        ptr = count;
                    }
                }
            }

            if (inTag) {
                openChar = nextChar;
            } else {
                i = nextChar;
                count = -1;
            }
        }

        // 32 is the space character
        if (!inTag && nextChar != 32) {
            i = nextChar;
            count = -1;
        }

        if (nextChar == '<' || (isXMLExtraction && openChar == ';' && nextChar == '&')) {
            inTag = false;

            //put back < or >
            // short-circuit && so charAt(count+2) is never evaluated once the
            // length check has failed (the previous non-short-circuit & could
            // throw StringIndexOutOfBoundsException on a short "&;" at the end)
            if ((nextChar == '&') && (count + 3 < size) && (buffer.charAt(count + 2) == 't') && (buffer.charAt(count + 3) == ';')) {
                if ((buffer.charAt(count + 1) == 'l')) {
                    i = '<';
                    count = -1;
                } else if ((buffer.charAt(count + 1) == 'g')) {
                    i = '>';
                    count = -1;
                }
            }
        }

        if (inTag && openChar == ';' && nextChar == ' ') {
            count = -1;
            i = ';';
        }

        count--;
    }

    return i;
}
/**
 * Builds a new array holding the supplied values in back-to-front order;
 * the array passed in is left untouched.
 */
private static int[] reverse(int[] indices) {
    final int[] flipped = new int[indices.length];
    int target = 0;
    for (int source = indices.length - 1; source >= 0; source--) {
        flipped[target++] = indices[source];
    }
    return flipped;
}
/**
 * Wraps content[child] in a &lt;center&gt; or &lt;right&gt; tag when its
 * position relative to middlePage suggests centred or right-aligned text.
 * Only active for XML extraction when paragraphs are not being estimated;
 * otherwise the fragment is left as it is.
 */
private void addAlignmentFormatting(boolean estimateParagraphs, int middlePage, float[] f_x1, float[] f_x2, int quarter, int child) {
    // distances from the middle of the page to each end of the fragment
    final float leftGap = middlePage - f_x1[child];
    final float rightGap = f_x2[child] - middlePage;

    if (estimateParagraphs || !isXMLExtraction) {
        return;
    }

    // the fragment must straddle the middle ...
    if (!(leftGap > 0) || !(rightGap > 0)) {
        return;
    }

    // ... and start inside the band between quarter and middlePage + quarter
    final float start = f_x1[child];
    if (!(start > quarter) || !(start < (middlePage + quarter))) {
        return;
    }

    // ratio of the smaller gap to the larger one, so 1 means perfectly centred
    float ratio = leftGap / rightGap;
    if (ratio > 1) {
        ratio = 1 / ratio;
    }

    final String openTag;
    final String closeTag;
    if (ratio > 0.95) { // add centring if seems centered around middle
        openTag = "<center>";
        closeTag = "</center>\n";
    } else if (rightGap < 10 && leftGap > 30) { // add right align
        openTag = "<right>";
        closeTag = "</right>\n";
    } else {
        return;
    }

    final StringBuffer wrapped = new StringBuffer(Fonts.cleanupTokens(content[child].toString()));
    wrapped.insert(0, openTag);
    wrapped.append(closeTag);
    content[child] = wrapped;
}
/**
* convert fragments into lines of text.
*
* For every unused fragment whose writing mode equals <b>mode</b>, repeatedly
* looks for the closest unused fragment further along the same line and merges
* it in (via merge) until no further match is found.
*
* @param count number of entries in items to scan
* @param items indices of the fragments to work on (reversed locally for
* right-to-left and top-to-bottom text)
* @param mode writing mode to build lines for (a PdfData writing mode constant)
* @param breakOnSpace if true (and hadSpace is available) stop extending a line
* once the fragment is flagged in hadSpace or the separator starts with a space
* @param addMultiplespaceXMLTag passed through to isGapASpace
* @param sameLineOnly if true, candidates failing the gap, baseline or font
* size tests (cases 1 to 5 below) are rejected
* @throws PdfException if mode is not one of the four writing modes handled
*/
private void createLines(int count, int[] items,int mode,boolean breakOnSpace,boolean addMultiplespaceXMLTag,boolean sameLineOnly) throws PdfException{
String separator;
//compile-time switch for the diagnostic output below
final boolean debug=false;
/**
* create local copies of arrays
*/
float[] f_x1,f_x2,f_y1,f_y2;
/**
* reverse order if text right to left
*/
if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
items=reverse(items);
/**
* set pointers so left to right text
*/
if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
f_x1=this.f_x1;
f_x2=this.f_x2;
f_y1=this.f_y1;
f_y2=this.f_y2;
}else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
f_x2=this.f_x1;
f_x1=this.f_x2;
f_y1=this.f_y1;
f_y2=this.f_y2;
}else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP){
f_x1=this.f_y1;
f_x2=this.f_y2;
f_y1=this.f_x2;
f_y2=this.f_x1;
}else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM){
f_x1=this.f_y2;
f_x2=this.f_y1;
f_y2=this.f_x1;
f_y1=this.f_x2;
}else{
throw new PdfException("Illegal value "+mode+"for currentWritingMode");
}
/**
* scan items joining best fit to right of each fragment to build
* lines. This is tedious and processor intensive but necessary as the
* order cannot be guaranteed
*/
for (int j = 0; j < count; j++) {
//id holds the best candidate found in the current pass (-1 = none yet)
int id = -1, i;
int c=items[j];
float smallest_gap = -1, gap, yMidPt;
if(!isUsed[c] && this.writingMode[c]==mode) {
if(debug)
System.out.println("Look for match with "+removeHiddenMarkers(content[c].toString()));
//each pass of this loop finds and merges one more fragment onto c
while (true) {
for (int j2 = 0; j2 < count; j2++) {
i=items[j2];
if(isUsed[i] == false){
//amount of variation in bottom of text
int baseLineDifference = (int) (f_y2[i] - f_y2[c]);
if (baseLineDifference < 0)
baseLineDifference = -baseLineDifference;
//amount of variation in top of text
int topLineDifference = (int) (f_y1[i] - f_y1[c]);
if (topLineDifference < 0)
topLineDifference = -topLineDifference;
// line gap
int lineGap = (int) (f_x1[i] - f_x2[c]);
//Check if fragments are closer from the other end
if(lineGap>(int) (f_x1[c] - f_x2[i]))
lineGap = (int) (f_x1[c] - f_x2[i]);
int fontSizeChange=fontSize[c]-fontSize[i];
if(fontSizeChange<0)
fontSizeChange=-fontSizeChange;
if(debug)
System.out.println("Against "+removeHiddenMarkers(content[i].toString()));
if(sameLineOnly && lineGap>fontSize[c] && lineGap>0){ //ignore text in wrong order allowing slight margin for error
// allow for multicolumns with gap
if(debug)
System.out.println("case1 lineGap="+lineGap);
// //Case removed as it broke one file and had no effect on other files
// }else if (sameLineOnly && (lineGap > (fontSize[c]*10)|| lineGap > (fontSize[i]*10)) ) { //JUMP IN TEXT SIZE ACROSS COL
// //ignore
//
// if(debug)
// System.out.println("case2");
}else if (sameLineOnly && baseLineDifference > 1 && lineGap > 2 * fontSize[c] && (fontSize[c] == fontSize[i])) { //TEXT SLIGHTLY OFFSET
//ignore
if(debug)
System.out.println("case3");
}else if(sameLineOnly && baseLineDifference>3){
//ignore
if(debug)
System.out.println("case4");
}else if(sameLineOnly && fontSizeChange>2){
//ignore
if(debug)
System.out.println("case5");
//NOTE: && binds tighter than ||, so the font size / top line test only applies to the top-to-bottom alternative
}else if (i!=c &&((f_x1[i] > f_x1[c] && mode!=PdfData.VERTICAL_TOP_TO_BOTTOM)||
f_x1[i] < f_x1[c] && mode==PdfData.VERTICAL_TOP_TO_BOTTOM && writingMode[c]==mode
&& (!(fontSizeChange>2) || (fontSizeChange>2 && topLineDifference<3))
)) { //see if on right
gap = (f_x1[i] - f_x2[c]);
if(debug)
System.out.println("case6 gap="+gap);
if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
gap=-gap;
//allow for fp error
if ((gap < 0) && (gap > -2))
gap = 0;
//make sure on right
yMidPt = (f_y1[i] + f_y2[i]) / 2;
//see if line & if only or better fit
if ((yMidPt < f_y1[c])&& (yMidPt > f_y2[c])&&((smallest_gap < 0)|| (gap < smallest_gap))) {
smallest_gap = gap;
id = i;
}
}
}
}
//merge on next right item or exit when no more matches
if (id == -1)
break;
float possSpace=f_x1[id]-f_x2[c];
if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
possSpace=-possSpace;
else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
possSpace=(f_x2[id]-f_x1[c]);
//add space if gap between this and last object
separator =isGapASpace(c,id,possSpace,addMultiplespaceXMLTag,mode);
/** merge if adjoin */
if ((breakOnSpace)&&(hadSpace!=null)&&((hadSpace[c])||(separator.startsWith(" "))))
break;
merge(c,id,separator,true);
id = -1; //reset
smallest_gap = 1000000; //and reset the gap
}
}
}
}
/**
 * Orders search highlights into reading order for a given page rotation.
 * Each object compared is either a Rectangle or a Rectangle[] (a highlight
 * split over several areas).
 *
 * Primary ordering per rotation:
 * <ul>
 * <li>0 and 180: larger y first, then smaller x first</li>
 * <li>90: smaller x first, then smaller y first</li>
 * <li>270: larger x first, then smaller y first</li>
 * </ul>
 * Ties are broken on width, then height, then on the following rectangles of
 * an array and finally on the array length. This gives a consistent total
 * order (compare(a, a) == 0 and compare(a, b) == -compare(b, a)), as
 * Collections.sort and TreeMap require. Geometrically identical highlights
 * therefore compare as equal, so a TreeMap keeps a single entry for them.
 * Any other rotation value falls back to the ordering used for 0.
 */
static class ResultsComparator implements Comparator {

    private final int rotation;

    public ResultsComparator(int rotation) {
        this.rotation = rotation;
    }

    /**
     * @param o1 a Rectangle or Rectangle[]
     * @param o2 a Rectangle or Rectangle[]
     * @return negative, zero or positive as o1 sorts before, with or after o2
     * @throws ClassCastException if an argument is of neither type
     */
    public int compare(Object o1, Object o2) {
        final Rectangle[] ra1 = asArray(o1);
        final Rectangle[] ra2 = asArray(o2);

        // the first differing pair of rectangles decides
        final int shared = Math.min(ra1.length, ra2.length);
        for (int i = 0; i < shared; i++) {
            final int result = compareRectangles(ra1[i], ra2[i]);
            if (result != 0) {
                return result;
            }
        }

        // one is a prefix of the other (or they are identical): shorter first
        return compareInts(ra1.length, ra2.length);
    }

    /** Wraps a lone Rectangle so both kinds of highlight are handled alike. */
    private static Rectangle[] asArray(Object o) {
        if (o instanceof Rectangle[]) {
            return (Rectangle[]) o;
        }
        return new Rectangle[]{(Rectangle) o};
    }

    /** Compares two rectangles by rotation-dependent position, then by size. */
    private int compareRectangles(Rectangle r1, Rectangle r2) {
        int result;
        switch (rotation) {
            case 90:
                result = compareInts(r1.x, r2.x);
                if (result == 0) {
                    result = compareInts(r1.y, r2.y);
                }
                break;
            case 270:
                result = compareInts(r2.x, r1.x);
                if (result == 0) {
                    result = compareInts(r1.y, r2.y);
                }
                break;
            default: // 0, 180 and anything unexpected
                result = compareInts(r2.y, r1.y);
                if (result == 0) {
                    result = compareInts(r1.x, r2.x);
                }
                break;
        }
        if (result == 0) {
            result = compareInts(r1.width, r2.width);
        }
        if (result == 0) {
            result = compareInts(r1.height, r2.height);
        }
        return result;
    }

    /** Overflow-safe three-way comparison of two ints. */
    private static int compareInts(int a, int b) {
        if (a < b) {
            return -1;
        }
        return (a > b) ? 1 : 0;
    }
}
//<link><a name="findMultipleTermsInRectangleWithMatchingTeasers" />
/**
 * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on <b>page_number</b>, with matching teaser
 *
 * Teaser generation is forced on for the duration of the call. The previous
 * includeTease setting and the usingMultipleTerms flag are restored when the
 * method exits, including when the search throws.
 *
 * @param x1 the left x cord
 * @param y1 the upper y cord
 * @param x2 the right x cord
 * @param y2 the lower y cord
 * @param rotation the rotation of the page to be searched (used to order the results)
 * @param page_number the page number to search on
 * @param terms the terms to search for
 * @param searchType the search type made up from one or more constants obtained from the SearchType class
 * @param listener polled between terms so the search can be cancelled; may be null
 * @return a SortedMap whose keys are the locations of found text (a Rectangle, or a Rectangle[] for
 * a result split over several areas) mapped to the matching teaser
 * @throws PdfException if the underlying search fails
 */
public SortedMap findMultipleTermsInRectangleWithMatchingTeasers(int x1, int y1, int x2, int y2, final int rotation,
        int page_number, String[] terms, int searchType, SearchListener listener) throws PdfException {

    final boolean origIncludeTease = includeTease;

    usingMultipleTerms = true;
    multipleTermTeasers.clear();
    teasers = null;
    includeTease = true;

    try {
        List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, page_number, terms, searchType, listener);

        SortedMap highlightsWithTeasers = new TreeMap(new ResultsComparator(rotation));
        for (int i = 0; i < highlights.size(); i++) {
            /*highlights.get(i) is a rectangle or a rectangle[]*/
            highlightsWithTeasers.put(highlights.get(i), multipleTermTeasers.get(i));
        }
        return highlightsWithTeasers;
    } finally {
        // always put the shared search state back, even if the search failed,
        // so a later search does not inherit these settings
        usingMultipleTerms = false;
        includeTease = origIncludeTease;
    }
}
//<link><a name="findMultipleTermsInRectangle" />
/**
 * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on <b>page_number</b>.
 *
 * The usingMultipleTerms flag is set for the duration of the call and is
 * cleared again when the method exits, including when the search throws.
 *
 * @param x1 the left x cord
 * @param y1 the upper y cord
 * @param x2 the right x cord
 * @param y2 the lower y cord
 * @param rotation the rotation of the page to be searched (used when ordering the results)
 * @param page_number the page number to search on
 * @param terms the terms to search for
 * @param orderResults if true the list that is returned is ordered to return the resulting rectangles in a
 * logical order descending down the page, if false, rectangles for multiple terms are grouped together.
 * @param searchType the search type made up from one or more constants obtained from the SearchType class
 * @param listener polled between terms so the search can be cancelled; may be null
 * @return a list describing the location of found text; each element is a Rectangle, or a Rectangle[]
 * for a result split over several areas
 * @throws PdfException if the underlying search fails
 */
public List findMultipleTermsInRectangle(int x1, int y1, int x2, int y2, final int rotation,
        int page_number, String[] terms, boolean orderResults, int searchType, SearchListener listener) throws PdfException {

    usingMultipleTerms = true;
    multipleTermTeasers.clear();
    teasers = null;

    try {
        List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, page_number, terms, searchType, listener);

        if (orderResults) {
            Collections.sort(highlights, new ResultsComparator(rotation));
        }
        return highlights;
    } finally {
        // always clear the flag, even if the search failed, so a later
        // single-term search stores its teasers in the usual place
        usingMultipleTerms = false;
    }
}
/**
 * Runs findText once per term and converts the returned coordinate groups
 * (five floats each: x1, y1, x2, y2, marker) into highlight objects.
 * A group whose marker equals linkedSearchAreas is joined with the groups
 * following it into a single Rectangle[]; any other group becomes one Rectangle.
 *
 * @param terms the terms to search for; null is treated as no terms
 * @param listener polled before each term; the search stops early once it
 * reports cancellation. May be null
 * @return list of Rectangle / Rectangle[] in the order found (never null)
 * @throws PdfException if findText fails
 */
private List findMultipleTermsInRectangle(int x1, int y1, int x2, int y2, int page_number, String[] terms, int searchType,
        SearchListener listener) throws PdfException {

    List list = new ArrayList();

    // findText accepts null terms and finds nothing, so do the same here
    // rather than failing with a NullPointerException
    if (terms == null) {
        return list;
    }

    for (String term : terms) {
        if (listener != null && listener.isCanceled()) {
            break;
        }

        // NOTE(review): java.awt.Rectangle takes width and height as its 3rd and
        // 4th arguments, whereas x2 and y2 are documented as coordinates - confirm
        float[] co_ords = findText(new Rectangle(x1, y1, x2, y2), page_number, new String[]{term}, searchType);
        if (co_ords == null) {
            continue;
        }

        int count = co_ords.length;

        // only read complete groups of five values
        for (int ii = 0; ii + 4 < count; ii = ii + 5) {
            int wx1 = (int) co_ords[ii];
            int wy1 = (int) co_ords[ii + 1];
            int wx2 = (int) co_ords[ii + 2];
            int wy2 = (int) co_ords[ii + 3];
            Rectangle rectangle = new Rectangle(wx1, wy2, wx2 - wx1, wy1 - wy2);

            int seperator = (int) co_ords[ii + 4];
            if (seperator == linkedSearchAreas) {
                Vector_Rectangle vr = new Vector_Rectangle();
                vr.addElement(rectangle);

                // follow the chain of linked areas, stopping at the end of the
                // data if the last group is still marked as linked
                while (seperator == linkedSearchAreas && ii + 9 < count) {
                    ii = ii + 5;
                    wx1 = (int) co_ords[ii];
                    wy1 = (int) co_ords[ii + 1];
                    wx2 = (int) co_ords[ii + 2];
                    wy2 = (int) co_ords[ii + 3];
                    seperator = (int) co_ords[ii + 4];
                    rectangle = new Rectangle(wx1, wy2, wx2 - wx1, wy1 - wy2);
                    vr.addElement(rectangle);
                }

                vr.trim();
                list.add(vr.get());
            } else {
                list.add(rectangle);
            }
        }
    }

    return list;
}
//<link><a name="findTextInRectangle" />
/**
* Method to find text on a page, allowing for the text to be split across multiple lines.<br>
* The page text is merged into lines per writing mode, joined into one string with \n between
* lines, and each term is matched against it as a regular expression.<br>
* @param searchArea = Area on page to search. NOTE(review): this parameter is not read anywhere
* in the method body, so the whole page appears to be searched whatever is passed - confirm
* @param page_number = the current page to search
* @param terms = the text to search for. If null an empty array is returned
* @param searchType = info on how to search the pdf (bit flags from SearchType)
* @return the coords of the found text in a float[] where the coords are pdf page coords.
* The origin of the coords is the bottom left hand corner (on unrotated page) organised in the following order.<br>
* [0]=result x1 coord<br>
* [1]=result y1 coord<br>
* [2]=result x2 coord<br>
* [3]=result y2 coord<br>
* [4]=either the value of linkedSearchAreas (previously documented as -101) to show that the next
* text area is the remainder of this word on another line else any other value is ignored.<br>
* @throws PdfException
*/
final public float[] findText(
Rectangle searchArea,
int page_number,
String[] terms,
int searchType)
throws PdfException {
//Failed to supply search terms to do nothing
if (terms == null)
return new float[]{};
//Flags to control the different search options
boolean firstOccuranceOnly = false;
boolean wholeWordsOnly = false;
boolean foundFirst = false;
boolean useRegEx = false;
//Search result and teaser holders
Vector_Float resultCoords = new Vector_Float(0);
Vector_String resultTeasers = new Vector_String(0);
//Extract the text data into local arrays for searching
copyToArrays();
//Remove any hidden text on page as should not be found
cleanupShadowsAndDrownedObjects(false);
//Get unused text objects and sort them for correct searching
int[] items = getsortedUnusedFragments(true, false);
/**
* check orientation and get preferred. Items not correct will be
* ignored
*/
//count the fragments per writing mode
//NOTE(review): the case labels 0-3 presumably match the PdfData writing mode constants - confirm
int l2r = 0;
int r2l = 0;
int t2b = 0;
int b2t = 0;
for(int i=0; i!=items.length; i++){
switch(writingMode[items[i]]){
case 0 :l2r++; break;
case 1 :r2l++; break;
case 2 :t2b++; break;
case 3 :b2t++; break;
}
}
int[] unsorted = new int[]{l2r, r2l, t2b, b2t};
int[] sorted = new int[]{l2r, r2l, t2b, b2t};
//Set all to -1 so we can tell if it's been set yet
int[] writingModes = new int[]{-1,-1,-1,-1};
Arrays.sort(sorted);
//build the list of writing modes ordered from most fragments to fewest:
//sorted is ascending, so slot j of sorted maps to position (3-j) of writingModes
for(int i=0; i!= unsorted.length; i++){
for(int j=0; j < sorted.length; j++){
if(unsorted[i]==sorted[j]){
int pos = j - 3;
if(pos<0)
pos=-pos;
if(writingModes[pos]==-1){
writingModes[pos] = i;
j=sorted.length;
}
}
}
}
//search each writing mode in turn, most common first
for(int u=0; u!=writingModes.length; u++){
int writingMode = writingModes[u];
//if not lines for writing mode, ignore
if(unsorted[writingMode]!=0){
//Merge text fragments into lines as displayed on page
createLines(items.length, items, writingMode, true, false, true);
//Bitwise flags for regular expressions engine, options always required
int options = 0;
//Match case-insensitively unless the CASE_SENSITIVE flag is set
if((searchType & SearchType.CASE_SENSITIVE) != SearchType.CASE_SENSITIVE){
options =(options | Pattern.CASE_INSENSITIVE);
}
//Only find first occurance of each search term
if((searchType & SearchType.FIND_FIRST_OCCURANCE_ONLY) == SearchType.FIND_FIRST_OCCURANCE_ONLY){
firstOccuranceOnly = true;
}
//Only find whole words, not partial words
if((searchType & SearchType.WHOLE_WORDS_ONLY) == SearchType.WHOLE_WORDS_ONLY){
wholeWordsOnly = true;
}
//Allow search to find split line results
if((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS){
options =(options | Pattern.MULTILINE | Pattern.DOTALL);
}
//Allow the use of regular expressions symbols
if((searchType & SearchType.USE_REGULAR_EXPRESSIONS) == SearchType.USE_REGULAR_EXPRESSIONS){
useRegEx = true;
}
/**
* create local copies of arrays
*/
float[] f_y1 = this.f_y1, f_y2 = this.f_y2;
/**
* swap around x and y so rountine works on all cases
*/
boolean valuesSwapped = false;
if (writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
f_y1 = this.f_y1;
f_y2 = this.f_y2;
} else if (writingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
f_y1 = this.f_y1;
f_y2 = this.f_y2;
} else if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
f_y1 = this.f_x2;
f_y2 = this.f_x1;
valuesSwapped = true;
} else if (writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
f_y2 = this.f_x1;
f_y1 = this.f_x2;
valuesSwapped = true;
}
//Portions of text to perform the search on and find teasers
String[] searchText;
String[] coordsText;
//Merge all text into one with \n line separators
//This will allow checking for multi line split results
String plain = "";
String raw = "";
for(int i=0; i!=content.length; i++){
if(content[i]!=null && writingMode == this.writingMode[i]){
raw += content[i] +"\n";
plain += content[i] +"\n";
}
}
//Remove double spaces, replacing them with single spaces
raw = removeDuplicateSpaces(raw);
plain = removeDuplicateSpaces(plain);
//Strip xml from content and keep coords and text data
raw = Strip.stripXML(raw,isXMLExtraction).toString();
//Strip xml and coords data from content and keep text data
plain = removeHiddenMarkers(plain);
plain = Strip.stripXML(plain,isXMLExtraction).toString();
//Store text in the search and teaser arrays
searchText = new String[]{plain};
coordsText = new String[]{raw};
//Hold starting point data at page rotation
Point resultStart;
//Work through the search terms one at a time
for(int j=0; j!=terms.length; j++){
String searchValue = terms[j];
//Set the default separator between words in a search term
String sep = " ";
//Multiline needs space or newline to be recognised as word separators
//(sep is used as a replaceAll replacement below, where a backslash must itself be escaped)
if((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS){
sep = "[ \\\\n]";
}
//if not using reg ex add reg ex literal flags around the text and word separators
if(!useRegEx){
searchValue = "\\Q"+searchValue+"\\E";
sep = "\\\\E"+sep+"\\\\Q";
}
//If word seperator has changed, replace all spaces with modified seperator
if(!sep.equals(" ")){
searchValue = searchValue.replaceAll(" ", sep);
}
//Surround search term with word boundry tags to match whole words
if(wholeWordsOnly)
searchValue = "\\b"+searchValue+"\\b";
//Create pattern to match search term
Pattern searchTerm = Pattern.compile(searchValue, options);
//Create pattern to match search term with two words before and after
Pattern teaserTerm = Pattern.compile("(?:\\S+\\s)?\\S*(?:\\S+\\s)?\\S*"+searchValue+"\\S*(?:\\s\\S+)?\\S*(?:\\s\\S+)?", options);
//Loop through all search text
for(int i=0; i!=searchText.length; i++){
//Get text data and text+coord data
String plainText = searchText[i];
String coordText = coordsText[i];
//So long as text data is not null
if(plainText!=null){
//Create two matchers for finding search term and teaser
Matcher termFinder = searchTerm.matcher(plainText);
Matcher teaserFinder = teaserTerm.matcher(plainText);
boolean needToFindTeaser = true;
//Keep looping till no result is returned
while(termFinder.find()){
resultStart = null;
//Make note of the text found and index in the text
String foundTerm = termFinder.group();
int termStarts = termFinder.start();
//index of the last character of the match (inclusive)
int termEnds = termFinder.end()-1;
//If storing teasers
if(includeTease){
//Store the term found as a default value
String teaser = foundTerm;
if(includeHTMLtags)
teaser = "<b>"+teaser+"</b>";
boolean itemFound = false;
if(needToFindTeaser){
itemFound = teaserFinder.find();
}
if(itemFound){
//Get a teaser if found and set the search term to bold is allowed
if(teaserFinder.start()<termStarts && teaserFinder.end()>termEnds){
//replace default with found teaser
teaser = teaserFinder.group();
if(includeHTMLtags){
//Calculate points to add bold tags
int teaseStarts = termStarts-teaserFinder.start();
int teaseEnds = (termEnds-teaserFinder.start())+1;
//Add bold tags
teaser = teaser.substring(0, teaseStarts) + "<b>" +
teaser.substring(teaseStarts, teaseEnds) + "</b>" +
teaser.substring(teaseEnds, teaser.length());
}
needToFindTeaser = true;
}else{
//teaser match does not enclose this term, so hold it back instead of advancing again on the next term
needToFindTeaser = false;
}
}
//Store teaser
resultTeasers.addElement(teaser);
}
//Get coords of found text for highlights
float currentX = 0;
float width = 0;
//Track point in text data line (without coord data)
int pointInLine = -1;
//Track line on page
int lineCounter = 0;
//Skip null values and value not in the correct writing mode to ensure correct result coords
while(content[lineCounter]==null || writingMode!=this.writingMode[lineCounter])
lineCounter++;
//Flags used to catch if result is split accross lines
boolean startFound = false;
boolean endFound = false;
//Cycle through coord text looking for coords of this result
//Ignore first value as it is known to be the first marker
//Each pass reads one group of three MARKER2-delimited values: x coord, width, text
for(int pointer=1; pointer<coordText.length(); pointer++){
// find second marker and get x coord
int startPointer = pointer;
while (pointer < coordText.length()) {
if (coordText.charAt(pointer) == MARKER2)
break;
pointer++;
}
//Convert text to float value for x coord
currentX = Float.parseFloat(coordText.substring(startPointer, pointer));
pointer++;
// find third marker and get width
startPointer = pointer;
while (pointer < coordText.length()) {
if (coordText.charAt(pointer) == MARKER2)
break;
pointer++;
}
//Convert text to float value for character width
width = Float.parseFloat(coordText.substring(startPointer, pointer));
pointer++;
// find fourth marker and get text (character)
startPointer = pointer;
while (pointer < coordText.length()) {
if (coordText.charAt(pointer) == MARKER2)
break;
pointer++;
}
//Store text to check for newline character later
String text = coordText.substring(startPointer, pointer);
pointInLine+=text.length();
//Start of term not found yet.
//Point in line is equal to or greater than start of the term.
//Store coords and mark start as found.
if(!startFound && pointInLine>=termStarts){
resultStart = new Point((int)currentX, (int)f_y1[lineCounter]);
startFound = true;
}
//End of term not found yet.
//Point in line is equal to or greater than end of the term.
//Store coords and mark end as found.
if(!endFound && pointInLine>=termEnds){
//NOTE: in "(int) currentX+width" the cast applies to currentX only; the sum is a float
if (valuesSwapped){
if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
resultCoords.addElement((int) f_y2[lineCounter]);
resultCoords.addElement((int) currentX+width);
resultCoords.addElement(resultStart.y);
resultCoords.addElement(resultStart.x);
resultCoords.addElement(0.0f);
} else {
resultCoords.addElement((int) f_y2[lineCounter]);
resultCoords.addElement(resultStart.x);
resultCoords.addElement(resultStart.y);
resultCoords.addElement((int) currentX+width);
resultCoords.addElement(0.0f);
}
}else{
resultCoords.addElement(resultStart.x);
resultCoords.addElement(resultStart.y);
resultCoords.addElement(currentX + width);
resultCoords.addElement(f_y2[lineCounter]);
resultCoords.addElement(0.0f);
}
endFound = true;
}
//Using multi line option.
//Start of term found.
//End of term not found.
//New line character found.
//Set up multi line result.
if(startFound && !endFound && text.contains("\n")){
//Set ends coords
if (valuesSwapped){
if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
resultCoords.addElement((int) f_y2[lineCounter]);
resultCoords.addElement((int) currentX+width);
resultCoords.addElement(resultStart.y);
resultCoords.addElement(resultStart.x);
resultCoords.addElement(linkedSearchAreas); //Mark next result as linked
} else {
resultCoords.addElement((int) f_y2[lineCounter]);
resultCoords.addElement(resultStart.x);
resultCoords.addElement(resultStart.y);
resultCoords.addElement((int) currentX+width);
resultCoords.addElement(linkedSearchAreas); //Mark next result as linked
}
}else{
resultCoords.addElement(resultStart.x);
resultCoords.addElement(resultStart.y);
resultCoords.addElement(currentX + width);
resultCoords.addElement(f_y2[lineCounter]);
resultCoords.addElement(linkedSearchAreas); //Mark next result as linked
}
//Set start of term as not found
startFound = false;
//Set this point in line as start of next term
//Guarantees next character is found as
//start of the next part of the search term
termStarts = pointInLine;
}
//In multiline mode we progress the line number when we find a \n
//This is to allow the correct calculation of y coords
if(text.contains("\n")){
lineCounter++;
//If current content pointed at is null or not the correct writing mode, skip value until data is found
while(lineCounter<content.length && (content[lineCounter]==null || writingMode!=this.writingMode[lineCounter])){
lineCounter++;
}
}
}
//If only finding first occurance,
//Stop searching this text data for search term.
if(firstOccuranceOnly){
foundFirst = true;
break;
}
}
//If only finding first occurance and first is found,
//Stop searching all text data for this search term.
if(firstOccuranceOnly && foundFirst){
break;
}
}
}
}
//Remove any trailing empty values
resultCoords.trim();
//If including tease values
if(includeTease){
//Remove any trailing empty values
resultTeasers.trim();
//Store teasers so they can be retrieved by different search methods
if (usingMultipleTerms){
//Store all teasers for so they may be returned as a sorted map
//Only used for one method controled by the above flag
for(int i=0; i!=resultTeasers.size(); i++)
multipleTermTeasers.add(resultTeasers.elementAt(i));
}else{
//Store all teasers to be retrieved by getTeaser() method
teasers = resultTeasers.get();
}
}
}
}
//Return coord data for search results
return resultCoords.get();
}
/**
 * Collapses every run of two or more consecutive space characters into a
 * single space. Other whitespace (tabs, newlines) is left untouched.
 *
 * @param textValue text to clean up (must not be null)
 * @return the text with no two spaces next to each other
 */
private static String removeDuplicateSpaces(String textValue) {
    // nothing to collapse without at least one space - avoids the regex work
    if (textValue.indexOf(' ') == -1) {
        return textValue;
    }
    // " {2,}" = two or more spaces in a row
    return textValue.replaceAll(" {2,}", " ");
}
/**
 * Gives access to the end points recorded by the last findtext.
 *
 * @return the current contents of the endPoints field
 */
public float[] getEndPoints() {
    return this.endPoints;
}
/**
 * Gives access to the text teasers stored by findtext. Teasers are only
 * stored when generateTeasers() was called before the find.
 *
 * @return the current contents of the teasers field (may be null)
 */
public String[] getTeasers() {
    return this.teasers;
}
/**
 * Switches teaser generation on, so that subsequent find text calls
 * build teasers as well as coordinates.
 */
public void generateTeasers() {
    this.includeTease = true;
}
}