Examples of org.apache.pdfbox.util.TextPosition

org.apache.pdfbox.util.TextPosition
This represents a string and a position on the screen of those characters. @author Ben Litchfield @version $Revision: 1.12 $

            int ltrCnt = 0;
            int rtlCnt = 0;


            while( textIter.hasNext() )
            {
                TextPosition position = (TextPosition)textIter.next();
                String stringValue = position.getCharacter();
                for (int a = 0; a < stringValue.length(); a++)
                {
                    byte dir = Character.getDirectionality(stringValue.charAt(a));
                    if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) ||
                            (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
                            (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE ))
                    {
                        ltrCnt++;
                    }
                    else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) ||
                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE ))
                    {
                        rtlCnt++;
                    }
                }
            }


            // choose the dominant direction
            boolean isRtlDominant = rtlCnt > ltrCnt;


            startArticle(!isRtlDominant);
            startOfArticle = true;
            // we will later use this to skip reordering
            boolean hasRtl = rtlCnt > 0;


            /* Now cycle through to print the text.
             * We queue up a line at a time before we print so that we can convert
             * the line from presentation form to logical form (if needed). 
             */
            List<TextPosition> line = new ArrayList<TextPosition>();


            textIter = textList.iterator();    // start from the beginning again
            /* PDF files don't always store spaces. We will need to guess where we should add
             * spaces based on the distances between TextPositions. Historically, this was done
             * based on the size of the space character provided by the font. In general, this worked
             * but there were cases where it did not work. Calculating the average character width
             * and using that as a metric works better in some cases but fails in some cases where the
             * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples.
             */
            //Keeps track of the previous average character width
            float previousAveCharWidth = -1;
            while( textIter.hasNext() )
            {
                TextPosition position = (TextPosition)textIter.next();
                PositionWrapper current = new PositionWrapper(position);
                String characterValue = position.getCharacter();


                //Resets the average character width when we see a change in font
                // or a change in the font size
                if(lastPosition != null && ((position.getFont() != lastPosition.getTextPosition().getFont())
                        || (position.getFontSize() != lastPosition.getTextPosition().getFontSize())))
                {
                    previousAveCharWidth = -1;
                }


                float positionX;
                float positionY;
                float positionWidth;
                float positionHeight;


                /* If we are sorting, then we need to use the text direction
                 * adjusted coordinates, because they were used in the sorting. */
                if (getSortByPosition())
                {
                    positionX = position.getXDirAdj();
                    positionY = position.getYDirAdj();
                    positionWidth = position.getWidthDirAdj();
                    positionHeight = position.getHeightDir();
                }
                else
                {
                    positionX = position.getX();
                    positionY = position.getY();
                    positionWidth = position.getWidth();
                    positionHeight = position.getHeight();
                }


                //The current amount of characters in a word
                int wordCharCount = position.getIndividualWidths().length;


                /* Estimate the expected width of the space based on the
                 * space character with some margin. */
                float wordSpacing = position.getWidthOfSpace();
                float deltaSpace = 0;
                if ((wordSpacing == 0) || (wordSpacing == Float.NaN))
                {
                    deltaSpace = Float.MAX_VALUE;
                }

View Full Code Here

            //
            boolean suppressCharacter = false;
            float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
            for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
            {
                TextPosition character = sameTextCharacters.get( i );
                String charCharacter = character.getCharacter();
                float charX = character.getX();
                float charY = character.getY();
                //only want to suppress


                if( charCharacter != null &&
                        //charCharacter.equals( textCharacter ) &&
                        within( charX, textX, tolerance ) &&
                        within( charY,
                                textY,
                                tolerance ) )
                {
                    suppressCharacter = true;
                }
            }
            if( !suppressCharacter )
            {
                sameTextCharacters.add( text );
                showCharacter = true;
            }
        }


        if( showCharacter )
        {
            //if we are showing the character then we need to determine which
            //article it belongs to.
            int foundArticleDivisionIndex = -1;
            int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
            int notFoundButFirstLeftArticleDivisionIndex = -1;
            int notFoundButFirstAboveArticleDivisionIndex = -1;
            float x = text.getX();
            float y = text.getY();
            if( shouldSeparateByBeads )
            {
                for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
                {
                    PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
                    if( bead != null )
                    {
                        PDRectangle rect = bead.getRectangle();
                        if( rect.contains( x, y ) )
                        {
                            foundArticleDivisionIndex = i*2+1;
                        }
                        else if( (x < rect.getLowerLeftX() ||
                                y < rect.getUpperRightY()) &&
                                notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
                        {
                            notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
                        }
                        else if( x < rect.getLowerLeftX() &&
                                notFoundButFirstLeftArticleDivisionIndex == -1)
                        {
                            notFoundButFirstLeftArticleDivisionIndex = i*2;
                        }
                        else if( y < rect.getUpperRightY() &&
                                notFoundButFirstAboveArticleDivisionIndex == -1)
                        {
                            notFoundButFirstAboveArticleDivisionIndex = i*2;
                        }
                    }
                    else
                    {
                        foundArticleDivisionIndex = 0;
                    }
                }
            }
            else
            {
                foundArticleDivisionIndex = 0;
            }
            int articleDivisionIndex = -1;
            if( foundArticleDivisionIndex != -1 )
            {
                articleDivisionIndex = foundArticleDivisionIndex;
            }
            else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
            {
                articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
            }
            else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
            {
                articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
            }
            else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
            {
                articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
            }
            else
            {
                articleDivisionIndex = charactersByArticle.size()-1;
            }


            List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get( articleDivisionIndex );


            /* In the wild, some PDF encoded documents put diacritics (accents on
             * top of characters) into a separate Tj element.  When displaying them
             * graphically, the two chunks get overlayed.  With text output though,
             * we need to do the overlay. This code recombines the diacritic with
             * its associated character if the two are consecutive.
             */ 
            if(textList.isEmpty())
            {
                textList.add(text);
            }
            else
            {
                /* test if we overlap the previous entry.  
                 * Note that we are making an assumption that we need to only look back
                 * one TextPosition to find what we are overlapping.  
                 * This may not always be true. */
                TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
                if(text.isDiacritic() && previousTextPosition.contains(text))
                {
                    previousTextPosition.mergeDiacritic(text, normalize);
                }
                /* If the previous TextPosition was the diacritic, merge it into this
                 * one and remove it from the list. */
                else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
                {
                    text.mergeDiacritic(previousTextPosition, normalize);
                    textList.remove(textList.size()-1);
                    textList.add(text);
                }

View Full Code Here

     * {@link #setListItemPatterns(List)}.
     * @param pw
     * @return
     */
    protected Pattern matchListItemPattern(PositionWrapper pw) {
        TextPosition tp = pw.getTextPosition();
        String txt = tp.getCharacter();
        Pattern p = matchPattern(txt,getListItemPatterns());
        return p;
    }

View Full Code Here

            int ltrCnt = 0;
            int rtlCnt = 0;


            while( textIter.hasNext() )
            {
                TextPosition position = (TextPosition)textIter.next();
                String stringValue = position.getCharacter();
                for (int a = 0; a < stringValue.length(); a++)
                {
                    byte dir = Character.getDirectionality(stringValue.charAt(a));
                    if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT ) ||
                            (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
                            (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE ))
                    {
                        ltrCnt++;
                    }
                    else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT ) ||
                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
                            (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE ))
                    {
                        rtlCnt++;
                    }
                }
            }


            // choose the dominant direction
            boolean isRtlDominant = rtlCnt > ltrCnt;


            startArticle(!isRtlDominant);
            startOfArticle = true;
            // we will later use this to skip reordering
            boolean hasRtl = rtlCnt > 0;


            /* Now cycle through to print the text.
             * We queue up a line at a time before we print so that we can convert
             * the line from presentation form to logical form (if needed). 
             */
            List<TextPosition> line = new ArrayList<TextPosition>();


            textIter = textList.iterator();    // start from the beginning again
            /* PDF files don't always store spaces. We will need to guess where we should add
             * spaces based on the distances between TextPositions. Historically, this was done
             * based on the size of the space character provided by the font. In general, this worked
             * but there were cases where it did not work. Calculating the average character width
             * and using that as a metric works better in some cases but fails in some cases where the
             * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples.
             */
            //Keeps track of the previous average character width
            float previousAveCharWidth = -1;
            while( textIter.hasNext() )
            {
                TextPosition position = (TextPosition)textIter.next();
                PositionWrapper current = new PositionWrapper(position);
                String characterValue = position.getCharacter();


                //Resets the average character width when we see a change in font
                // or a change in the font size
                if(lastPosition != null && ((position.getFont() != lastPosition.getTextPosition().getFont())
                        || (position.getFontSize() != lastPosition.getTextPosition().getFontSize())))
                {
                    previousAveCharWidth = -1;
                }


                float positionX;
                float positionY;
                float positionWidth;
                float positionHeight;


                /* If we are sorting, then we need to use the text direction
                 * adjusted coordinates, because they were used in the sorting. */
                if (getSortByPosition())
                {
                    positionX = position.getXDirAdj();
                    positionY = position.getYDirAdj();
                    positionWidth = position.getWidthDirAdj();
                    positionHeight = position.getHeightDir();
                }
                else
                {
                    positionX = position.getX();
                    positionY = position.getY();
                    positionWidth = position.getWidth();
                    positionHeight = position.getHeight();
                }


                //The current amount of characters in a word
                int wordCharCount = position.getIndividualWidths().length;


                /* Estimate the expected width of the space based on the
                 * space character with some margin. */
                float wordSpacing = position.getWidthOfSpace();
                float deltaSpace = 0;
                if ((wordSpacing == 0) || (wordSpacing == Float.NaN))
                {
                    deltaSpace = Float.MAX_VALUE;
                }

View Full Code Here

            //
            boolean suppressCharacter = false;
            float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
            for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
            {
                TextPosition character = sameTextCharacters.get( i );
                String charCharacter = character.getCharacter();
                float charX = character.getX();
                float charY = character.getY();
                //only want to suppress


                if( charCharacter != null &&
                        //charCharacter.equals( textCharacter ) &&
                        within( charX, textX, tolerance ) &&
                        within( charY,
                                textY,
                                tolerance ) )
                {
                    suppressCharacter = true;
                }
            }
            if( !suppressCharacter )
            {
                sameTextCharacters.add( text );
                showCharacter = true;
            }
        }


        if( showCharacter )
        {
            //if we are showing the character then we need to determine which
            //article it belongs to.
            int foundArticleDivisionIndex = -1;
            int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
            int notFoundButFirstLeftArticleDivisionIndex = -1;
            int notFoundButFirstAboveArticleDivisionIndex = -1;
            float x = text.getX();
            float y = text.getY();
            if( shouldSeparateByBeads )
            {
                for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
                {
                    PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
                    if( bead != null )
                    {
                        PDRectangle rect = bead.getRectangle();
                        if( rect.contains( x, y ) )
                        {
                            foundArticleDivisionIndex = i*2+1;
                        }
                        else if( (x < rect.getLowerLeftX() ||
                                y < rect.getUpperRightY()) &&
                                notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
                        {
                            notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
                        }
                        else if( x < rect.getLowerLeftX() &&
                                notFoundButFirstLeftArticleDivisionIndex == -1)
                        {
                            notFoundButFirstLeftArticleDivisionIndex = i*2;
                        }
                        else if( y < rect.getUpperRightY() &&
                                notFoundButFirstAboveArticleDivisionIndex == -1)
                        {
                            notFoundButFirstAboveArticleDivisionIndex = i*2;
                        }
                    }
                    else
                    {
                        foundArticleDivisionIndex = 0;
                    }
                }
            }
            else
            {
                foundArticleDivisionIndex = 0;
            }
            int articleDivisionIndex = -1;
            if( foundArticleDivisionIndex != -1 )
            {
                articleDivisionIndex = foundArticleDivisionIndex;
            }
            else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
            {
                articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
            }
            else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
            {
                articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
            }
            else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
            {
                articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
            }
            else
            {
                articleDivisionIndex = charactersByArticle.size()-1;
            }


            List<TextPosition> textList = (List<TextPosition>) charactersByArticle.get( articleDivisionIndex );


            /* In the wild, some PDF encoded documents put diacritics (accents on
             * top of characters) into a separate Tj element.  When displaying them
             * graphically, the two chunks get overlayed.  With text output though,
             * we need to do the overlay. This code recombines the diacritic with
             * its associated character if the two are consecutive.
             */ 
            if(textList.isEmpty())
            {
                textList.add(text);
            }
            else
            {
                /* test if we overlap the previous entry.  
                 * Note that we are making an assumption that we need to only look back
                 * one TextPosition to find what we are overlapping.  
                 * This may not always be true. */
                TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
                if(text.isDiacritic() && previousTextPosition.contains(text))
                {
                    previousTextPosition.mergeDiacritic(text, normalize);
                }
                /* If the previous TextPosition was the diacritic, merge it into this
                 * one and remove it from the list. */
                else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
                {
                    text.mergeDiacritic(previousTextPosition, normalize);
                    textList.remove(textList.size()-1);
                    textList.add(text);
                }

View Full Code Here

     * {@link #setListItemPatterns(List)}.
     * @param pw
     * @return
     */
    protected Pattern matchListItemPattern(PositionWrapper pw) {
        TextPosition tp = pw.getTextPosition();
        String txt = tp.getCharacter();
        Pattern p = matchPattern(txt,getListItemPatterns());
        return p;
    }

View Full Code Here

    prevLineY = lineY;


    float start = getFirstTrimmed(line).getX();
    //    leftMargin += start;
    incrementOrAdd(leftMargin, start);
    TextPosition lastTrimmed = getLastTrimmed(line);
    float end = lastTrimmed.getX() + lastTrimmed.getWidth();
    //    rightMargin += end;
    incrementOrAdd(rightMargin, end);


    Float fontSize;
    for (TextPosition t : line) {

View Full Code Here

  private String lastStyle = null;
  private float prevLineY = -1f;
  private boolean pageBreak = false;


  protected void printImage(List<TextPosition> line) throws IOException {
    TextPosition start = getFirstTrimmed(line);
    float y = start.getY();
    for (Entry<Float, Image> entry : pageImages.entrySet()) {
      if (entry.getKey() < y) {
        Image image = entry.getValue();
        String name = imageStripper.printImage(image);
        pageImages.remove(entry.getKey());

View Full Code Here

    if (line.size() < 1) {
      return;
    }


    float start = -1;
    TextPosition firstText = getFirstTrimmed(line);
    start = firstText.getX();
    if (start == -1 || firstText.getCharacter().trim().isEmpty()) {
      return;
    }


    float end = -1;
    TextPosition lastText = getLastTrimmed(line);
    end = lastText.getX() + lastText.getWidth();
    if (end == -1 || lastText.getCharacter().trim().isEmpty()) {
      return;
    }


    if (start > maxLeftMargin /*&& end < minRightMargin*/) {
      // too much lineSpacing

View Full Code Here

       * if so, is it dominant. */
      int ltrCnt = 0;
      int rtlCnt = 0;


      while (textIter.hasNext()) {
        TextPosition position = textIter.next();
        String stringValue = position.getCharacter();
        for (int a = 0; a < stringValue.length(); a++) {
          byte dir = Character.getDirectionality(stringValue.charAt(a));
          if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT)
            || (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING)
            || (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)) {
            ltrCnt++;
          } else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
            rtlCnt++;
          }
        }
      }


      // choose the dominant direction
      boolean isRtlDominant = rtlCnt > ltrCnt;


      startArticle(!isRtlDominant);
      startOfArticle = true;
      // we will later use this to skip reordering
      boolean hasRtl = rtlCnt > 0;


      /* Now cycle through to print the text.
       * We queue up a line at a time before we print so that we can convert
       * the line from presentation form to logical form (if needed). 
       */
      List<TextPosition> line = new ArrayList<TextPosition>();


      textIter = textList.iterator(); // start from the beginning again
      /* PDF files don't always store spaces. We will need to guess where we should add
       * spaces based on the distances between TextPositions. Historically, this was done
       * based on the size of the space character provided by the font. In general, this worked
       * but there were cases where it did not work. Calculating the average character width
       * and using that as a metric works better in some cases but fails in some cases where the
       * spacing worked. So we use both. NOTE: Adobe reader also fails on some of these examples.
       */
      //Keeps track of the previous average character width
      float previousAveCharWidth = -1;
      while (textIter.hasNext()) {
        TextPosition position = textIter.next();
        PositionWrapper current = new PositionWrapper(position);
        String characterValue = position.getCharacter();


        //Resets the average character width when we see a change in font
        // or a change in the font size
        if (lastPosition != null
          && ((position.getFont() != lastPosition.getTextPosition().getFont()) || (position.getFontSize() != lastPosition
            .getTextPosition().getFontSize()))) {
          previousAveCharWidth = -1;
        }


        float positionX;
        float positionY;
        float positionWidth;
        float positionHeight;


        /* If we are sorting, then we need to use the text direction
         * adjusted coordinates, because they were used in the sorting. */
        if (getSortByPosition()) {
          positionX = position.getXDirAdj();
          positionY = position.getYDirAdj();
          positionWidth = position.getWidthDirAdj();
          positionHeight = position.getHeightDir();
        } else {
          positionX = position.getX();
          positionY = position.getY();
          positionWidth = position.getWidth();
          positionHeight = position.getHeight();
        }


        //The current amount of characters in a word
        int wordCharCount = position.getIndividualWidths().length;


        /* Estimate the expected width of the space based on the
         * space character with some margin. */
        float wordSpacing = position.getWidthOfSpace();
        float deltaSpace = 0;
        if ((wordSpacing == 0) || (wordSpacing == Float.NaN)) {
          deltaSpace = Float.MAX_VALUE;
        } else {
          if (lastWordSpacing < 0) {

View Full Code Here

TOP

Related Classes of org.apache.pdfbox.util.TextPosition

net.timendum.pdf.LocalPDFTextStripper

net.timendum.pdf.PDFText2HTML

net.timendum.pdf.StatisticParser

org.apache.pdfbox.util.PDFTextStripper

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.