
/**
 * UTF8Magic is a simple class for parsing UTF-8 with a Latin1 fallback.
 * <p>
 * It is designed to be completely self-contained, very simple to use, and to
 * fit easily into any existing project.
 * <p>
 * This class is in the public domain, yay!
 *
 * @author Vidar koala_man Holen
 */

public class UTF8Magic {
    /**
     * Parse a String read as Latin1, and conditionally convert from UTF-8.
     * <p>
     * doMagic inspects a String that was read as Latin1 (ISO-8859-1), i.e.
     * one char per input byte, to see whether those bytes form well-formed
     * UTF-8. If they do, the decoded String is returned. If they do not, the
     * argument is returned as it is.
     * <p>
     * The input is treated as not being UTF-8 (and returned unchanged) when
     * it contains any of the following:
     * <ul>
     * <li>a char above 0xFF (the String was not read as Latin1),</li>
     * <li>a continuation byte (0x80-0xBF) where a lead byte is expected,</li>
     * <li>a lead byte that can never start a sequence (0xC0, 0xC1,
     *     0xF5-0xFF),</li>
     * <li>a lead byte followed by too few bytes, or by a byte outside
     *     0x80-0xBF,</li>
     * <li>an overlong encoding, an encoded surrogate (U+D800..U+DFFF) or a
     *     value above U+10FFFF.</li>
     * </ul>
     * Four-byte sequences are decoded into a UTF-16 surrogate pair.
     * <p>
     * Example:
     * <pre>
     * myString = myBufferedReader.readLine();
     * myString = UTF8Magic.doMagic(myString);
     * parse(myString);
     * </pre>
     * The parse-method will be called with a correct String, regardless of
     * whether the data is Latin1 or UTF-8.
     *
     * @param s The string to parse, may be null
     * @return the converted string; s itself if it is not UTF-8 or consists
     *         of plain ASCII only; null if s is null.
     */
    public static String doMagic(String s) {
        if (s == null) {
            return null;
        }

        /*
         * Decoding happens in place: every sequence produces no more chars
         * than it consumes, so the write position never passes the read
         * position.
         */
        char[] data = s.toCharArray();
        int upos = 0;            /* Unicode (write) position. */
        boolean changed = false; /* If any multi-byte sequence was decoded. */

        for (int lpos = 0; lpos < data.length; lpos++) {
            char lead = data[lpos];

            if (lead < 0x80) {
                /* Yay, plain ascii */
                data[upos++] = lead;
                continue;
            }

            int count;   /* Number of following bytes in the sequence. */
            int minimum; /* Smallest value that needs this many bytes. */
            if (lead >= 0xC2 && lead <= 0xDF) {
                count = 1;
                minimum = 0x80;
            } else if (lead >= 0xE0 && lead <= 0xEF) {
                count = 2;
                minimum = 0x800;
            } else if (lead >= 0xF0 && lead <= 0xF4) {
                count = 3;
                minimum = 0x10000;
            } else {
                /*
                 * Not a Latin1 char (> 0xFF), a stray continuation byte,
                 * or a byte that never starts a UTF-8 sequence.
                 */
                return s;
            }

            if (lpos + count >= data.length) {
                return s; /* We're missing bytes. */
            }

            /* Payload bits of the lead byte: 5, 4 or 3 of them. */
            int codePoint = lead & ((1 << (6 - count)) - 1);

            for (int i = 1; i <= count; i++) {
                char next = data[lpos + i];
                if (next < 0x80 || next > 0xBF) {
                    return s; /* Invalid follow-up char. */
                }
                codePoint = (codePoint << 6) | (next & 0x3F);
            }

            if (codePoint < minimum) {
                return s; /* Overlong encoding. */
            }
            if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
                return s; /* Surrogates must not be encoded in UTF-8. */
            }
            if (codePoint > 0x10FFFF) {
                return s; /* Beyond the Unicode range. */
            }

            lpos += count;
            if (codePoint >= 0x10000) {
                /* Our chars are 16 bit, so store a surrogate pair. */
                int offset = codePoint - 0x10000;
                data[upos++] = (char) (0xD800 + (offset >> 10));
                data[upos++] = (char) (0xDC00 + (offset & 0x3FF));
            } else {
                data[upos++] = (char) codePoint;
            }
            changed = true;
        }

        if (changed) {
            return new String(data, 0, upos);
        }
        return s;
    }


    /* v--- Cut here ---v */
    /**
     * Run some simple validity and speed tests on {@code doMagic}.
     * <p>
     * The same six letters are passed through doMagic once as UTF-8 bytes
     * and once as Latin1; the timings are only printed when both results
     * are equal.
     * <p>
     * This method is not required and can be edited out.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String utf8 = doMagic("\u00c3\u00a6\u00c3\u00b8\u00c3\u00a5\u00c3\u0086\u00c3\u0098\u00c3\u0085");
        String latin1 = doMagic("\u00e6\u00f8\u00e5\u00c6\u00d8\u00c5");

        if (utf8.equals(latin1)) {
            System.out.println("The strings are equal, the doMagic did magic!");

            System.out.println("1M doMagics(latin1) takes "+time(latin1)+"ms");
            System.out.println("1M doMagics(utf-8)  takes "+time(utf8)+  "ms");
            System.out.println("1M doMagics(ascii)  takes "+time("aeoEOE")+"ms");
        } else {
            System.out.println("It doesn't work, koala_man is a jackass :(");
        }
    }

    /**
     * Return the time in milliseconds to convert the string a million times.
     * <p>
     * This method is not required and can be edited out.
     *
     * @param s The string handed to doMagic on every iteration
     * @return elapsed wall-clock time in milliseconds
     */
    public static long time(String s) {
        long start = System.currentTimeMillis();
        for (int i = 0; i < 1000000; i++) {
            doMagic(s);
        }
        return System.currentTimeMillis() - start;
    }
    /* ^--- Cut here ---^ */

}
