Package com.ibm.icu.text

Examples of com.ibm.icu.text.CharsetDetector.detect()


        if (charset == null) {
          CharsetDetector det = new CharsetDetector();
          det.enableInputFilter(true);
          InputStream detStream = new BufferedInputStream(sourceStream);
          det.setText(detStream);
          charset = det.detect().getName();
          sourceStream = detStream;
        }
       
        // wtf? still nothing, just take system-standard
        if (charset == null) {
View Full Code Here


  }

  public String autoDetectEncoding(byte[] bytes) {
    CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    CharsetMatch charsetMatch = cd.detect();
    String charSet = charsetMatch.getName();

    int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
View Full Code Here

            stream = new BufferedInputStream(stream);
        }
   
        detector.setText(stream);
   
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
       
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
View Full Code Here

            }

            is = new BufferedInputStream( new FileInputStream( f ) );
            CharsetDetector detector = new CharsetDetector();
            detector.setText( is );
            CharsetMatch match = detector.detect();

            return match.getName().toUpperCase( Locale.ENGLISH );
        }
        catch ( IOException e )
        {
View Full Code Here

    }

    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return Charset.forName(match.getName().toUpperCase());
  }

  /**
   * A pretty good test that something is UTF-8. There are many sequences that will pass here that
View Full Code Here

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }
  }

  /**
 
View Full Code Here

        }

        if (Status.OK.equals(status)) {
          CharsetDetector detector = new CharsetDetector();
          detector.setText(read(response.getEntity().getContent()));
          CharsetMatch match = detector.detect();

          log.debug("Detected charset: " + match.getName());

          String content = match.getString();
          CharBuffer buffer = CharBuffer.wrap(content.toCharArray());
View Full Code Here

        if (!det.inputFilterEnabled()){
            errln("input filter should be enabled");
        }
       
        det.setText(bytes);
        m = det.detect();
       
        if (! m.getLanguage().equals("fr")) {
            errln("input filter did not strip markup!");
        }
       
View Full Code Here

            errln("input filter did not strip markup!");
        }
       
        det.enableInputFilter(false);
        det.setText(bytes);
        m = det.detect();
       
        if (! m.getLanguage().equals("en")) {
            errln("unfiltered input did not detect as English!");
        }
    }
View Full Code Here

        byte[] leBytes = source.getBytes("UnicodeLittle");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
       
        det.setText(beBytes);
        m = det.detect();
       
        if (! m.getName().equals("UTF-16BE")) {
            errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
        }
       
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.