Package org.mozilla.universalchardet

Examples of org.mozilla.universalchardet.UniversalDetector


     * @param bytes 待检测的字节数组
     * @return 可能的字符集,如果检测失败,返回utf-8
     */
    public static String guessEncoding(byte[] bytes) {
        String DEFAULT_ENCODING = "UTF-8";
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(bytes, 0, bytes.length);
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        detector.reset();
        if (encoding == null) {
            encoding = DEFAULT_ENCODING;
        }
        return encoding;
    }
View Full Code Here


        byte[] buf = new byte[4096];
        String fileName = args[0];
        java.io.FileInputStream fis = new java.io.FileInputStream(fileName);

        // (1)
        UniversalDetector detector = new UniversalDetector(null);

        // (2)
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        // (3)
        detector.dataEnd();

        // (4)
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            System.out.println("Detected encoding = " + encoding);
        } else {
            System.out.println("No encoding detected.");
        }

        // (5)
        detector.reset();
    }
View Full Code Here

  /**
   * Detect encoding by analyzing characters in the array
   */
  public static String detectEncoding(byte[] bytes) {
    String DEFAULT_ENCODING = "UTF-8";
    UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(bytes, 0, bytes.length);
    detector.dataEnd();
    String encoding = detector.getDetectedCharset();
    detector.reset();
    if (encoding == null) {
      encoding = DEFAULT_ENCODING;
    } else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
      encoding = "windows-1252";
    }
View Full Code Here

   * Consumes a given {@link InputStream} and returns a string consisting of the
   * html code of the site.
   */
  public static String consumeStream(InputStream stream) throws IOException {
    try {
      UniversalDetector detector = new UniversalDetector(null);
      ReadableByteChannel bc = Channels.newChannel(stream);
      ByteBuffer buffer = ByteBuffer.allocate(BUFFER_SIZE);
      int read = 0;
      while ((read = bc.read(buffer)) != -1) {
        detector.handleData(buffer.array(), buffer.position() - read, read);
        buffer = resizeBuffer(buffer);
      }
      detector.dataEnd();
      // copy the result back to a byte array
      String encoding = detector.getDetectedCharset();
      return new String(buffer.array(), 0, buffer.position(),
          encoding == null ? "UTF-8" : encoding);
    } finally {
      if (stream != null) {
        stream.close();
View Full Code Here

   * @throws IOException
   */
  public static String getFileCharset(File file) throws IOException {
    byte[] buf = new byte[4096];
    BufferedInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(file));
    final UniversalDetector universalDetector = new UniversalDetector(null);

    int numberOfBytesRead;
    while ((numberOfBytesRead = bufferedInputStream.read(buf)) > 0 && !universalDetector.isDone()) {
      universalDetector.handleData(buf, 0, numberOfBytesRead);
    }

    universalDetector.dataEnd();
    bufferedInputStream.close();
    String encoding = universalDetector.getDetectedCharset();

    if (encoding != null) {
      logger.debug("Detected encoding for {} is {}.", file.getAbsolutePath(), encoding);
    } else {
      logger.debug("No encoding detected for {}.", file.getAbsolutePath());
    }

    universalDetector.reset();

    return encoding;
  }
View Full Code Here

    private ByteBuffer remaining = null;
    private boolean isFlushed = false;

    /**
     * Creates the decoder: calls the superclass constructor with the enclosing
     * {@code JUniversalChardetCharset} instance and the constants {@code 1.0f} and
     * {@code 2.0f}, then assigns a new {@code UniversalDetector} (constructed with a
     * {@code null} argument) to {@code detector}.
     *
     * NOTE(review): the superclass is not visible here; if it is
     * {@code java.nio.charset.CharsetDecoder}, the two floats would be the average and
     * maximum chars-per-byte ratios. Confirm against the class declaration.
     */
    protected Decoder() {
      super(JUniversalChardetCharset.this, 1.0f, 2.0f);
      detector = new UniversalDetector(null);
    }
View Full Code Here

      return result;
    }

    @Override
    protected void implReset() {
      detector = new UniversalDetector(null);
      buffer = new ByteArrayOutputStream();
      usedDecoder = null;
      remaining = null;
      isFlushed = false;
    }
View Full Code Here

  protected String getCharsetFromBytes(InputStream resource) throws IOException {
    String charsetName;

    byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
    // (1)
    UniversalDetector detector = new UniversalDetector(null);

    // (2)
    resource.mark(MAX_CHARSET_READAHEAD);
    int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
    resource.reset();
    detector.handleData(bbuffer, 0, len);
    // (3)
    detector.dataEnd();
    // (4)
    charsetName = detector.getDetectedCharset();

    // (5)
    detector.reset();
    if(isCharsetSupported(charsetName)) {
      return charsetName;
    }
    return null;
  }
View Full Code Here

TOP

Related Classes of org.mozilla.universalchardet.UniversalDetector

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.