Examples of com.ibm.icu.text.CharsetDetector

com.ibm.icu.text.CharsetDetector
CharsetDetector provides a facility for detecting the charset or encoding of character data in an unknown format. The input data can either be from an input stream or an array of bytes. The result of the detection operation is a list of possibly matching charsets, or, for simple use, you can just ask for a Java Reader that will will work over the input data.
Character set detection is at best an imprecise operation. The detection process will attempt to identify the charset that best matches the characteristics of the byte data, but the process is partly statistical in nature, and the results can not be guaranteed to always be correct.
For best accuracy in charset detection, the input data should be primarily in a single language, and a minimum of a few hundred bytes worth of plain text in the language are needed. The detection process will attempt to ignore html or xml style markup that could otherwise obscure the content.
@stable ICU 3.4

        }
    }
    
    public void TestConstruction() {
        int i;
        CharsetDetector  det = new CharsetDetector();
        if(det==null){
            errln("Could not construct a charset detector");
        }
        String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
        CheckAssert(charsetNames.length != 0);

View Full Code Here


    public void TestInputFilter() throws Exception
    {
        String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
        byte[] bytes = s.getBytes("ISO-8859-1");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
        
        det.enableInputFilter(true);
        if (!det.inputFilterEnabled()){
            errln("input filter should be enabled");
        }
        
        det.setText(bytes);
        m = det.detect();
        
        if (! m.getLanguage().equals("fr")) {
            errln("input filter did not strip markup!");
        }
        
        det.enableInputFilter(false);
        det.setText(bytes);
        m = det.detect();
        
        if (! m.getLanguage().equals("en")) {
            errln("unfiltered input did not detect as English!");
        }
    }

View Full Code Here

        String  s = "This is a string with some non-ascii characters that will " +
                    "be converted to UTF-8, then shoved through the detection process.  " +
                    "\u0391\u0392\u0393\u0394\u0395" +
                    "Sure would be nice if our source could contain Unicode directly!";
        byte [] bytes = s.getBytes("UTF-8");
        CharsetDetector det = new CharsetDetector();
        String retrievedS;
        Reader reader;
        
        retrievedS = det.getString(bytes, "UTF-8");
        CheckAssert(s.equals(retrievedS));
        
        reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8");
        CheckAssert(s.equals(stringFromReader(reader)));
        det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
    }

View Full Code Here

                "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
                "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
        
        byte[] beBytes = source.getBytes("UnicodeBig");
        byte[] leBytes = source.getBytes("UnicodeLittle");
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
        
        det.setText(beBytes);
        m = det.detect();
        
        if (! m.getName().equals("UTF-16BE")) {
            errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
        }
        
        det.setText(leBytes);
        m = det.detect();
        
        if (! m.getName().equals("UTF-16LE")) {
            errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
        }

View Full Code Here

            "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";


        byte[] bISO     = sISO.getBytes("ISO-8859-1");
        byte[] bWindows = sWindows.getBytes("windows-1252");
        
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
        
        det.setText(bWindows);
        m = det.detect();
        
        if (m.getName() != "windows-1252") {
            errln("Text with C1 bytes not correctly detected as windows-1252.");
            return;
        }
        
        det.setText(bISO);
        m = det.detect();
        
        if (m.getName() != "ISO-8859-1") {
            errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
        }
    }

View Full Code Here

                {(byte)'A', (byte)'B'},
                {(byte)'A', (byte)'B', (byte)'C'},
                {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
            };
        
        CharsetDetector det = new CharsetDetector();
        CharsetMatch m;
        for (int i=0; i<shortBytes.length; i++) {
            det.setText(shortBytes[i]);
            m = det.detect();
            logln("i=" + i + " -> " + m.getName());
        }
    }

View Full Code Here

            null,
            null,
            "ISO-8859-1"
        };
        
        CharsetDetector det = new CharsetDetector();
        CharsetMatch match;


        det.setDeclaredEncoding("ISO-2022-JP");


        for (int idx = 0; idx < testStrings.length; idx += 1) {
            det.setText(testStrings[idx]);
            match = det.detect();


            if (match == null) {
                if (testResults[idx] != null) {
                    errln("Unexpectedly got no results at index " + idx);
                }

View Full Code Here

        if (split.length > 1) {
            lang = split[1];
        }


        try {
            CharsetDetector det = new CharsetDetector();
            byte[] bytes;
            
            //if (enc.startsWith("UTF-32")) {
            //    UTF32 utf32 = UTF32.getInstance(enc);
                
            //    bytes = utf32.toBytes(testString);
            //} else {
                String from = enc;


                while (true) {
                    try {
                        bytes = testString.getBytes(from);
                    } catch (UnsupportedOperationException uoe) {
                         // In some runtimes, the ISO-2022-CN converter
                         // only converts *to* Unicode - we have to use
                         // x-ISO-2022-CN-GB to convert *from* Unicode.
                        if (from.equals("ISO-2022-CN")) {
                            from = "x-ISO-2022-CN-GB";
                            continue;
                        }
                        
                        // Ignore any other converters that can't
                        // convert from Unicode.
                        return;
                    } catch (UnsupportedEncodingException uee) {
                        // Ignore any encodings that this runtime
                        // doesn't support.
                        return;
                    }
                    
                    break;
                }
            //}
        
            det.setText(bytes);
            checkMatch(det, testString, enc, lang, id);
            
            det.setText(new ByteArrayInputStream(bytes));
            checkMatch(det, testString, enc, lang, id);
         } catch (Exception e) {
            errln(id + ": " + e.toString() + "enc=" + enc);
            e.printStackTrace();
        }

View Full Code Here

        CheckAssert(charsetMatch.equals("IBM420_ltr"));


    }


    private CharsetMatch _testIBM420_ar_rtl(String s, CharsetEncoder encoder) throws Exception {
        CharsetDetector det = new CharsetDetector();
        det.setText(encoder.encode(CharBuffer.wrap(s)).array());
        CharsetMatch m = det.detect();
        return m;
    }

View Full Code Here

         */    
        
        StringBuffer ltrStrBuf = new StringBuffer(s);
        ltrStrBuf = ltrStrBuf.reverse();
        
        CharsetDetector det = new CharsetDetector();
        det.setText(encoder.encode(CharBuffer.wrap(ltrStrBuf.toString())).array());
        CharsetMatch m = det.detect();
        return m;
    }

View Full Code Here

0 1 2 3 4

TOP

Related Classes of com.ibm.icu.text.CharsetDetector

com.ibm.icu.dev.demo.charsetdet.DetectingViewer

com.ibm.icu.dev.test.charsetdet.TestCharsetDetector

com.nardoz.restopengov.utils.CSVFetcher

fr.eolya.utils.http.HttpStream

fr.eolya.utils.http.WebStream

net.geco.basics.GecoResources

net.sf.jmatchparser.util.charset.icu4jchardet.ICU4JChardetCharset$Decoder

net.vidageek.crawler.component.WebDownloader

net.yacy.document.parser.htmlParser

nu.validator.htmlparser.extra.IcuDetectorSniffer

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.