Package com.ibm.icu.text

Examples of com.ibm.icu.text.CharsetDetector


    }

    private CharsetMatch _test1256(String s) throws Exception {
       
        byte [] bytes = s.getBytes("windows-1256");
        CharsetDetector det = new CharsetDetector();
        det.setText(bytes);
        CharsetMatch m = det.detect();
        return m;
    }
View Full Code Here


        CheckAssert(charsetMatch.equals("IBM424_ltr"));
    }

    private CharsetMatch _test1255(String s) throws Exception {
        byte [] bytes = s.getBytes("ISO-8859-8");
        CharsetDetector det = new CharsetDetector();
        det.setText(bytes);
        CharsetMatch m = det.detect();
        return m;
    }
View Full Code Here

        return m;
    }
   
    private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
        byte [] bytes = s.getBytes("IBM424");       
        CharsetDetector det = new CharsetDetector();
        det.setText(bytes);
        CharsetMatch m = det.detect();
        return m;
    }
View Full Code Here

       
        StringBuffer ltrStrBuf = new StringBuffer(s);
        ltrStrBuf = ltrStrBuf.reverse();
        byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
       
        CharsetDetector det = new CharsetDetector();
        det.setText(bytes);
        CharsetMatch m = det.detect();
        return m;
    }
View Full Code Here

  private final List<EncodingClue> clues;

  public EncodingDetector(Configuration conf) {
    minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
    detector = new CharsetDetector();
    clues = new ArrayList<EncodingClue>();
  }
View Full Code Here

                return result;
            }
        }
        // in case of HTML or XML check whether there is a charset
        // specification; might be too fragile
        CharsetDetector detector = new CharsetDetector();
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
        detector.setText(in);
        CharsetMatch found = detector.detect();
        result = found.getName();
        LOG.debug("Encoding: " + result);
        return result;
    }
View Full Code Here

     * and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
     *
     * @return Reader to utf8 encoded reader.
     */
    public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
        CharsetDetector detector = new CharsetDetector();
   
        // Use the declared character encoding, if available
        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
   
        // CharsetDetector expects a stream to support marks
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
   
        detector.setText(stream);
   
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
       
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
View Full Code Here

  private List<EncodingClue> clues;

  public EncodingDetector(Configuration conf) {
    minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
    detector = new CharsetDetector();
    clues = new ArrayList<EncodingClue>();
  }
View Full Code Here

   * @return encoding name detected encoding name
   * @throws IOException
   *             occurs when the detection is failed.
   */
  public static String detectEncoding(byte[] data, String defaultEncoding) throws IOException {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();
    String estimatedEncoding = cm.getName();
    boolean isReliable = Charset.isSupported(estimatedEncoding) && cm.getConfidence() >= MINIMAL_CONFIDENCE_LEVEL;
    return isReliable ? estimatedEncoding : defaultEncoding;
  }
View Full Code Here

  private List<EncodingClue> clues;

  public EncodingDetector(Configuration conf) {
    minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
    detector = new CharsetDetector();
    clues = new ArrayList<EncodingClue>();
  }
View Full Code Here

TOP

Related Classes of com.ibm.icu.text.CharsetDetector

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.