/*
 * Scone - The Web Enhancement Framework
 * Copyright (C) 2009 Harald Weinreich, Volkert Buchmann, Frank Wollenweber, Torsten Ha
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package  scone.util;


import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;


/**
 * Tries to find out the language of a list of words.
 * <p>
 * Every word passed to {@link #countWord(String)} is compared against a list
 * of frequent keywords per language; {@link #getLanguage()} reports the
 * language whose keywords were hit most often.
 * <p>
 * Built-in keyword lists exist for English ("en"), German ("de"),
 * Spanish ("es") and French ("fr"). More keywords or languages can be added
 * with {@link #putLang(String, String)}.
 * <p>
 * A language is only reported once more than 5 hits were counted for it,
 * so short samples yield an empty string.<br>
 * example:
 * <pre>
 * LanguageAnalyzer la = new LanguageAnalyzer();
 * String[] words = {"The", "cat", "is", "on", "the", "mat",
 *                   "and", "you", "can", "see", "it"};
 * for (int i = 0; i &lt; words.length; i++) {
 *     la.countWord(words[i]);
 * }
 * System.out.println(la.getLanguage());   // will print "en"
 * </pre>
 */
public class LanguageAnalyzer {
    // a language must collect strictly more hits than this to be reported
    private static final int HIT_THRESHOLD = 5;

    // english, from: The 100 most frequently used words: http://www.mcrel.org/products/literacy/100words.asp
    private static final String[] ENGLISH_KEYWORDS = {
        "the", "a", "and", "to", "I", "in", "is", "on", "you", "it",
        "of", "said", "can", "for", "my", "but", "all", "we", "are", "up"
    };
    // german, from: The most 200 frequent words in German: http://www-personal.umich.edu/~hmr/Miscellaneous/Frequent_Words.html
    private static final String[] GERMAN_KEYWORDS = {
        "ich", "er", "sie", "es", "der", "die", "das", "ein", "eine", "dieser",
        "haben", "hat", "sein", "sind", "ist", "werden", "wird", "eins", "zwei",
        "drei", "bis", "aber", "denn", "und"
    };
    // spanish
    private static final String[] SPANISH_KEYWORDS = {
        "la", "el", "un", "una", "y", "bien", "bueno", "con", "de", "del",
        "dos", "ella", "en", "esto", "hasta", "hoja", "mi", "mucho", "muy",
        "nada", "nuestro", "pues", "solo", "su", "vamos", "ver", "yo"
    };
    // french; the former empty keyword and the repeated "le" / "elle" entries
    // are left out because blank and duplicate registrations are ignored anyway
    private static final String[] FRENCH_KEYWORDS = {
        "la", "le", "un", "une", "on", "bien", "fort", "les", "faire", "fait",
        "prendre", "prend", "en", "pas", "je", "tu", "elle", "il", "soit",
        "ca", "est", "que"
    };

    // the normalized keywords; keys.elementAt(i) belongs to languages.elementAt(i)
    private Vector keys;
    // the languages of the keywords (parallel to keys)
    private Vector languages;
    // every distinct language, in the order of first registration
    private Vector knownLanguages;
    // the hits of each normalized keyword
    private IntHashtable keyCount;
    // the length of the longest normalized keyword
    private int maxLength = 0;

    /**
     * Creates a new LanguageAnalyzer with the built-in keyword lists for
     * English, German, Spanish and French.
     */
    public LanguageAnalyzer() {
        // initialize variables
        keys = new Vector();
        languages = new Vector();
        knownLanguages = new Vector();
        keyCount = new IntHashtable();
        // register through the private helper so that no overridable
        // method runs while the object is still being constructed
        registerAll("en", ENGLISH_KEYWORDS);
        registerAll("de", GERMAN_KEYWORDS);
        registerAll("es", SPANISH_KEYWORDS);
        registerAll("fr", FRENCH_KEYWORDS);
    }

    /**
     * Test method: counts a few words of mixed languages and prints the
     * detected language. The sample stays below the hit threshold, so an
     * empty line is printed.
     */
    public static void main(String[] args) {
        LanguageAnalyzer la = new LanguageAnalyzer();

        la.countWord("the");
        la.countWord("tarzen");
        la.countWord("hier");
        la.countWord("der");
        la.countWord("el");
        la.countWord("wolkenkratzer");
        la.countWord("is");
        la.countWord("aber");
        System.out.println(la.getLanguage());
    }

    /**
     * Adds a new keyword to a language.
     * <br>
     * Adds the language if it is new.<p>
     * A keyword may be registered for several languages; a hit then counts
     * for each of them. Registering the same keyword twice for the same
     * language has no further effect, and hits that were already counted
     * for a keyword are kept. Blank keywords are ignored.<br>
     * This class is optimized for short keywords.
     * @param lang the language
     * @param key the keyword, case insensitive; surrounding whitespace is ignored.
     * @throws NullPointerException if lang or key is null
     */
    public void putLang(String lang, String key) {
        register(lang, key);
    }

    /**
     * Counts one occurrence of a word.
     * <br>
     * Only words that match a registered keyword (ignoring case and
     * surrounding whitespace) are counted; null, blank and unknown words
     * are ignored.
     * @param word the word to count
     */
    public void countWord(String word) {
        if (word == null) {
            return;
        }
        String trimmed = word.trim();
        // cheap pre-filter: nothing longer than the longest keyword can match
        if (trimmed.length() == 0 || trimmed.length() > maxLength) {
            return;
        }
        String normalized = normalize(trimmed);
        // only keywords are tracked, so arbitrary input cannot grow the table
        if (keyCount.containsKey(normalized)) {
            keyCount.inc(normalized);
        }
    }

    /**
     * Returns the language which had the most hits.
     * <br>
     * The totals are computed from scratch on every call, so calling this
     * method repeatedly does not change the result. If several languages
     * share the highest total, the one that was registered first wins.
     * @return the language, or an empty string if no language collected
     *         more than 5 hits
     */
    public String getLanguage() {
        // sum up the hits of each language
        IntHashtable totals = new IntHashtable();
        for (int j = 0; j < keys.size(); j++) {
            totals.add(languages.elementAt(j), keyCount.getInt(keys.elementAt(j)));
        }
        // find the language with the highest total above the threshold
        String best = "";
        int bestHits = HIT_THRESHOLD;
        for (int j = 0; j < knownLanguages.size(); j++) {
            String lang = (String) knownLanguages.elementAt(j);
            int hits = totals.getInt(lang);
            if (hits > bestHits) {
                bestHits = hits;
                best = lang;
            }
        }
        return best;
    }

    // registers every keyword of the array for the given language
    private void registerAll(String lang, String[] keywords) {
        for (int i = 0; i < keywords.length; i++) {
            register(lang, keywords[i]);
        }
    }

    // registers one keyword for a language; blank keywords and pairs that
    // are already known are skipped
    private void register(String lang, String key) {
        if (lang == null || key == null) {
            throw new NullPointerException("lang and key must not be null");
        }
        String normalized = normalize(key);
        if (normalized.length() == 0 || isRegistered(lang, normalized)) {
            return;
        }
        keys.addElement(normalized);
        languages.addElement(lang);
        // keep hits that were already counted if the keyword is known
        if (!keyCount.containsKey(normalized)) {
            keyCount.put(normalized, 0);
        }
        if (!knownLanguages.contains(lang)) {
            knownLanguages.addElement(lang);
        }
        if (maxLength < normalized.length()) {
            maxLength = normalized.length();
        }
    }

    // tells whether the normalized keyword is already registered for the language
    private boolean isRegistered(String lang, String normalizedKey) {
        for (int i = 0; i < keys.size(); i++) {
            if (normalizedKey.equals(keys.elementAt(i)) && lang.equals(languages.elementAt(i))) {
                return true;
            }
        }
        return false;
    }

    // brings keywords and counted words into the same form; a fixed locale
    // keeps the lower-casing independent of the platform default
    private static String normalize(String text) {
        return text.trim().toLowerCase(java.util.Locale.ENGLISH);
    }

    // hashtable with int values
    static class IntHashtable extends Hashtable {

        // get integer value by key
        // return 0 if key is not in hashtable
        public int getInt(Object key) {
            Object value = get(key);
            if (value != null) {
                return  ((Integer) value).intValue();
            }
            return  0;
        }

        // increases value at key by one
        public void inc(Object key) {
            add(key, 1);
        }

        // puts a new value into the hashtable
        public void put(Object key, int value) {
            this.put(key, new Integer(value));
        }

        // adds i to the value of key; a missing key counts as 0
        public void add(Object key, int i) {
            this.put(key, getInt(key) + i);
        }
    }
}

