Package com.ibm.icu.text

Examples of com.ibm.icu.text.CharsetDetector


    }

      //encoding detection
    try {
      BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
      CharsetDetector cd = new CharsetDetector();
      cd.setText(bis);
      CharsetMatch cm = cd.detect();
      if (cm != null) {
        format += "; charset=" + cm.getName();
      }
    } catch (IOException e) {
      log.error("Error detecting charset for '{}': {}", fileName, e.getMessage());
View Full Code Here


  private List<EncodingClue> clues;

  public EncodingDetector(Configuration conf) {
    minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
    detector = new CharsetDetector();
    clues = new ArrayList<EncodingClue>();
  }
View Full Code Here

  private EncodingDetector() {}

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }
View Full Code Here

        }

        // encoding detection
        // FIXME: is this required?
        try (BufferedInputStream bis = new BufferedInputStream(openStream(file))) {
            CharsetDetector cd = new CharsetDetector();
            cd.setText(bis);
            CharsetMatch cm = cd.detect();
            if (cm != null) {
                log.trace("Detected charset {} in {}", cm.getName(), file);
                format += "; charset=" + cm.getName();
            }
            bis.close();
View Full Code Here

    if (assume88591IfNotUtf8) {
      return "ISO-8859-1";
    }

    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return match.getName().toUpperCase();
  }
View Full Code Here

  private final List<EncodingClue> clues;

  public EncodingDetector(Configuration conf) {
    minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
    detector = new CharsetDetector();
    clues = new ArrayList<EncodingClue>();
  }
View Full Code Here

        return read(new FileInputStream(path));
    }

    public ICSVFetcherResult read(InputStream stream) throws Exception {

        CharsetDetector detector = new CharsetDetector();
        detector.setText(new BufferedInputStream(stream));

        CSVReader reader = new CSVReader(detector.detect().getReader(), separator);

        String[] keys = reader.readNext();
        String[] nextLine;

        callback.onStart();
View Full Code Here

  private EncodingDetector() {}
 
  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }
View Full Code Here

            }
        }

        // the author didn't tell us the encoding, try the mozilla-heuristic
        if (charset == null) {
            final CharsetDetector det = new CharsetDetector();
            det.enableInputFilter(true);
            final InputStream detStream = new BufferedInputStream(sourceStream);
            det.setText(detStream);
            charset = det.detect().getName();
            sourceStream = detStream;
        }

        // wtf? still nothing, just take system-standard
        if (charset == null) {
View Full Code Here

                return result;
            }
        }
        // in case of HTML or XML check whether there is a charset
        // specification; might be too fragile
        CharsetDetector detector = new CharsetDetector();
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
        detector.setText(in);
        CharsetMatch found = detector.detect();
        result = found.getName();
        LOG.debug("Encoding: " + result);
        return result;
    }
View Full Code Here

TOP

Related Classes of com.ibm.icu.text.CharsetDetector

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.