Package com.ibm.icu.text

Examples of com.ibm.icu.text.CharsetDetector


            }
        }

        // the author didn't tell us the encoding, try the mozilla-heuristic
        if (charset == null) {
          CharsetDetector det = new CharsetDetector();
          det.enableInputFilter(true);
          InputStream detStream = new BufferedInputStream(sourceStream);
          det.setText(detStream);
          charset = det.detect().getName();
          sourceStream = detStream;
        }
       
        // wtf? still nothing, just take system-standard
        if (charset == null) {
View Full Code Here


    String defaultCharset = Charset.defaultCharset().name();
    setSelectedItem(defaultCharset);
  }

  public String autoDetectEncoding(byte[] bytes) {
    CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    CharsetMatch charsetMatch = cd.detect();
    String charSet = charsetMatch.getName();

    int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
View Full Code Here

     * and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
     *
     * @return Reader to utf8 encoded reader.
     */
    public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
        CharsetDetector detector = new CharsetDetector();
   
        // Use the declared character encoding, if available
        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
   
        // CharsetDetector expects a stream to support marks
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
   
        detector.setText(stream);
   
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
       
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
View Full Code Here

                reader = ReaderFactory.newXmlReader( f );
                return ( (XmlStreamReader) reader ).getEncoding();
            }

            is = new BufferedInputStream( new FileInputStream( f ) );
            CharsetDetector detector = new CharsetDetector();
            detector.setText( is );
            CharsetMatch match = detector.detect();

            return match.getName().toUpperCase( Locale.ENGLISH );
        }
        catch ( IOException e )
        {
View Full Code Here

    if (assume88591IfNotUtf8) {
      return ISO_8859_1;
    }

    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(input);
    CharsetMatch match = detector.detect();
    return Charset.forName(match.getName().toUpperCase());
  }
View Full Code Here

  public static class FallbackEncodingDetector {
    public Charset detectEncoding(byte[] input) {
      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
      CharsetDetector detector = new CharsetDetector();
      detector.setText(input);
      CharsetMatch match = detector.detect();
      return Charset.forName(match.getName().toUpperCase());
    }
View Full Code Here

        if (!acceptsMimeType(response.getLastHeader("Content-Type"))) {
          return new RejectedMimeTypePage(url, status, response.getLastHeader("Content-Type").getValue());
        }

        if (Status.OK.equals(status)) {
          CharsetDetector detector = new CharsetDetector();
          detector.setText(read(response.getEntity().getContent()));
          CharsetMatch match = detector.detect();

          log.debug("Detected charset: " + match.getName());

          String content = match.getString();
          CharBuffer buffer = CharBuffer.wrap(content.toCharArray());
View Full Code Here

*/
public class CharsetDetectorIcu implements ICharsetDetector {

  public Collection<String> detectCharset(byte[] bytes) {
   
    CharsetDetector detector = new CharsetDetector();
    detector.setText(bytes);
   
    CharsetMatch[] matches = detector.detectAll();
    if ( matches == null || matches.length == 0 ) {
      return null;
    }
   
    Collection<String> charsets = new LinkedHashSet<String>();
View Full Code Here

        return filtered;
    }
   
    private CharsetMatch[] detect(byte[] bytes)
    {
        CharsetDetector det = new CharsetDetector();
       
        det.setText(bytes);
       
        return det.detectAll();
    }
View Full Code Here

        return det.detectAll();
    }
   
    private CharsetMatch[] detect(BufferedInputStream inputStream)
    {
        CharsetDetector det    = new CharsetDetector();
       
        try {
            det.setText(inputStream);
           
            return det.detectAll();
        } catch (Exception e) {
            // TODO: error message?
            return null;
        }
    }
View Full Code Here

TOP

Related Classes of com.ibm.icu.text.CharsetDetector

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.