Package org.pdfbox.cos

Examples of org.pdfbox.cos.COSDocument


  * Convert a PDF file into text and save PDF fields in an IndexableDoc object
  */
public void getDocument(String ifile, IndexableDoc doc
{
  doc.setFileType("text"); doc.setFileName(ifile);
  COSDocument cosDoc = null;
  logger.info("Extracting text from PDF file " + ifile);
  try
  { cosDoc = parseDocument(new FileInputStream(new File(ifile)) ); }
  catch (OutOfMemoryError exc)
  {  logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage());
  return; }
  catch (IOException e)
  { logger.error("Cannot read PDF document " + ifile + " " +  e.getMessage());
  return; }
  catch (Exception e)
  { logger.error("Could not parse PDF document" + ifile + " " + e.getMessage());
  return; }

  // decrypt the PDF document, if it is encrypted -- use a blank password
  try
  {
   String password = "";
   if ( (cosDoc != null) && (cosDoc.isEncrypted()) )
   { DocumentEncryption decryptor = new DocumentEncryption(cosDoc); decryptor.decryptDocument(password); }
  }
  catch (CryptographyException e)
  { logger.error("Could not decrypt PDF doc: " + ifile + " " + e.getMessage()); closeCOSDocument(cosDoc)
  return; }
View Full Code Here


        {
            is = new java.io.FileInputStream(in);
            PDFParser parser = new PDFParser(is);
            parser.parse();

            COSDocument doc = parser.getDocument();

            os = new java.io.FileOutputStream(out);
            writer = new COSWriter(os);

            writer.write(doc);
View Full Code Here

        else
        {
                this.willEncrypt = false;
        }
               
        COSDocument cosDoc = document.getDocument();
        COSDictionary trailer = cosDoc.getTrailer();
        COSArray idArray = (COSArray)trailer.getDictionaryObject( "ID" );
        if( idArray == null )
        {
            try
            {
               
                //algorithm says to use time/path/size/values in doc to generate
                //the id.  We don't have path or size, so do the best we can
                MessageDigest md = MessageDigest.getInstance( "MD5" );
                md.update( Long.toString( System.currentTimeMillis()).getBytes() );
                COSDictionary info = (COSDictionary)trailer.getDictionaryObject( "Info" );
                if( info != null )
                {
                    Iterator values = info.getValues().iterator();
                    while( values.hasNext() )
                    {
                        md.update( values.next().toString().getBytes() );
                    }
                }
                idArray = new COSArray();
                COSString id = new COSString( md.digest() );
                idArray.add( id );
                idArray.add( id );
                trailer.setItem( "ID", idArray );
            }
            catch( NoSuchAlgorithmException e )
            {
                throw new COSVisitorException( e );
            }
        }
       
        /*
        List objects = doc.getObjects();
        Iterator iter = objects.iterator();
        long maxNumber = 0;
        while( iter.hasNext() )
        {
            COSObject object = (COSObject)iter.next();
            if( object.getObjectNumber() != null &&
                object.getGenerationNumber() != null )
            {
                COSObjectKey key = new COSObjectKey( object.getObjectNumber().longValue(),
                                                     object.getGenerationNumber().longValue() );
                objectKeys.put( object.getObject(), key );
                objectKeys.put( object, key );
                maxNumber = Math.max( key.getNumber(), maxNumber );
                setNumber( maxNumber );
            }
        }*/
        cosDoc.accept(this);
    }
View Full Code Here

        {
            if ( raf == null )
            {
                if( tempDirectory != null )
                {
                    document = new COSDocument( tempDirectory );
                }
                else
                {
                    document = new COSDocument();
                }
            }
            else
            {
                document = new COSDocument( raf );
            }
            setDocument( document );
            String header = readLine();
            document.setHeaderString( header );
View Full Code Here

     *
     * @throws IOException If there is an error creating this document.
     */
    public FDFDocument() throws IOException
    {
        document = new COSDocument();
        document.setHeaderString( "%FDF-1.2" );

        //First we need a trailer
        document.setTrailer( new COSDictionary() );

View Full Code Here

     *
     * @throws IOException If there is an error creating this document.
     */
    public PDDocument() throws IOException
    {
        document = new COSDocument();

        //First we need a trailer
        COSDictionary trailer = new COSDictionary();
        document.setTrailer( trailer );

View Full Code Here

  public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
      PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
      parser.parse();
      COSDocument cosDoc = parser.getDocument();

      PDFTextStripper stripper = new PDFTextStripper();
      String docText = stripper.getText(new PDDocument(cosDoc));
      cosDoc.close();

      return new IndexDocument(fileData.path, docText, null);
    } catch (IOException e) {
      String msg = "Failed to write to the index";
      log.error(msg, e);
View Full Code Here

                is = resourceURL.openStream();
            }

            PDFParser parser = new PDFParser(is);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();

            PDFTextStripper stripper = new PDFTextStripper();
            String docText = stripper.getText(new PDDocument(cosDoc));
            cosDoc.close();
            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", docText, Field.Store.NO, Field.Index.TOKENIZED));
            IndexWriter writer = new IndexWriter(RegistryContext.getBaseInstance().getJdbcDir(),
View Full Code Here

                                String license)
        throws PackageValidationException, CrosswalkException,
               AuthorizeException, SQLException, IOException
    {
        InputStream bis = null;
        COSDocument cos = null;
        boolean success = false;
        Bundle original = null;
        Bitstream bs = null;
        WorkspaceItem wi = null;

        /** XXX comment out for now
          // XXX for debugging of parameter handling
          if (params != null)
          {
              Enumeration pe = params.propertyNames();
              while (pe.hasMoreElements())
              {
                  String name = (String)pe.nextElement();
                  String v[] = params.getProperties(name);
                  StringBuffer msg = new StringBuffer("PackageParam: ");
                  msg.append(name).append(" = ");
                  for (int i = 0; i < v.length; ++i)
                  {
                      if (i > 0)
                          msg.append(", ");
                      msg.append(v[i]);
                  }
                  log.debug(msg);
              }
          }
        **/
          
        try
        {
            // Save the PDF in a bitstream first, since the parser
            // has to read it as well, and we cannot "rewind" it after that.
            wi = WorkspaceItem.create(context, collection, false);
            Item myitem = wi.getItem();
            original = myitem.createBundle("ORIGINAL");
            bs = original.createBitstream(pkg);
            pkg.close();
            bs.setName("package.pdf");
            setFormatToMIMEType(context, bs, "application/pdf");
            bs.update();
            log.debug("Created bitstream ID="+String.valueOf(bs.getID())+", parsing...");

            crosswalkPDF(context, myitem, bs.retrieve());

            wi.update();
            context.commit();
            success = true;
            log.info(LogManager.getHeader(context, "ingest",
                "Created new Item, db ID="+String.valueOf(myitem.getID())+
                ", WorkspaceItem ID="+String.valueOf(wi.getID())));
            return wi;
        }
        finally
        {
            try
            {
                // Close bitstream input stream and PDF file.
                if (bis != null)
                    bis.close();
                if (cos != null)
                    cos.close();
            }
            catch (IOException ie)
            { }

            // get rid of bitstream and item if ingest fails
View Full Code Here

    }

    private void crosswalkPDF(Context context, Item item, InputStream metadata)
        throws CrosswalkException, IOException, SQLException, AuthorizeException
    {
        COSDocument cos = null;

        try
        {
            PDFParser parser = new PDFParser(metadata);
            parser.parse();
            cos = parser.getDocument();

            // sanity check: PDFBox breaks on encrypted documents, so give up.
            if(cos.getEncryptionDictionary() != null)
                throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");

            /* PDF to DC "crosswalk":
             *
             * NOTE: This is not in a crosswalk plugin because (a) it isn't
             * useful anywhere else, and more importantly, (b) the source
             * data is not XML so it doesn't fit the plugin's interface.
             *
             * pattern of crosswalk -- PDF dict entries to DC:
             *   Title -> title.null
             *   Author -> contributor.author
             *   CreationDate -> date.created
             *   ModDate -> date.created
             *   Creator -> description.provenance (application that created orig)
             *   Producer -> description.provenance (convertor to pdf)
             *   Subject -> description.abstract
             *   Keywords -> subject.other
             *    date is java.util.Calendar
             */
            PDDocument pd = new PDDocument(cos);
            PDDocumentInformation docinfo = pd.getDocumentInformation();
            String title = docinfo.getTitle();

            // sanity check: item must have a title.
            if (title == null)
                throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
            log.debug("PDF Info dict title=\""+title+"\"");
            item.addDC("title", null, "en", title);
            String value;
            Calendar date;
            if ((value = docinfo.getAuthor()) != null)
            {
                item.addDC("contributor", "author", null, value);
                log.debug("PDF Info dict author=\""+value+"\"");
            }
            if ((value = docinfo.getCreator()) != null)
                item.addDC("description", "provenance", "en",
                              "Application that created the original document: "+value);
            if ((value = docinfo.getProducer()) != null)
                item.addDC("description", "provenance", "en",
                              "Original document converted to PDF by: "+value);
            if ((value = docinfo.getSubject()) != null)
                item.addDC("description", "abstract", null, value);
            if ((value = docinfo.getKeywords()) != null)
                item.addDC("subject", "other", null, value);

            // Take either CreationDate or ModDate as "date.created",
            // Too bad there's no place to put "last modified" in the DC.
            Calendar calValue;
            if ((calValue = docinfo.getCreationDate()) == null)
                calValue = docinfo.getModificationDate();
            if (calValue != null)
                item.addDC("date", "created", null,
                             (new DCDate(calValue.getTime())).toString());
            item.update();
        }
        finally
        {
            if (cos != null)
                cos.close();
        }
    }
View Full Code Here

TOP

Related Classes of org.pdfbox.cos.COSDocument

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.