/*
 * Copyright (C) 2005  Thomas Mur
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  
 * USA
 *
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

/**
 * Counts the words of a text file, groups the words that share the same
 * computed root (for instance a singular and its plural) and writes the
 * result, sorted by decreasing number of occurrences, into a text file.
 * <p>
 * Input and output files are read and written as ISO-8859-1.
 * <p>
 * Instances are not thread-safe: sorting relies on the mutable flag set by
 * {@link #setSortOnWord(boolean)}, and the set of already logged characters
 * is shared by all instances without synchronization.
 *
 * @author Thomas Mur
 */
public class WordCounter {

  /** Charset used for both the input text and the output report. */
  private static final String CHARSET_NAME = "ISO-8859-1";

  /** Characters that separate two words on a line. */
  private static final String WORD_DELIMITERS = " ,?;.:!*+\n\t\r\"\'()[]+";

  /** Sentinel returned by {@link #tryToDecodeCharacter(char)} for an unknown character. */
  private static final char UNKNOWN_CHARACTER = (char) -1;

  /** Name of the report written by {@link #main(String[])}. */
  private static final String DEFAULT_OUTPUT_FILE = "word_counter.txt";

  // ----------------------------------------------------------------------------
  // ----------------------------------- main -----------------------------------
  // ----------------------------------------------------------------------------

  /**
   * Command line entry point: counts the words of the file given as single
   * argument and writes the report into <code>word_counter.txt</code> in the
   * current directory.
   * <p>
   * Exits with status 1 on a wrong number of arguments and with status 2 when
   * anything fails during processing.
   *
   * @param args exactly one element: the path to the text file
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.out.println("Syntax: java WordCounter \"path/to/file.txt\"");
      System.exit(1);
    }
    try {
      WordCounter wc = new WordCounter();
      wc.countAllWords(args[0]);
      wc.writeSortedListInto(DEFAULT_OUTPUT_FILE);
    }
    catch (Throwable x) {
      // - top level boundary: report everything and signal the failure
      x.printStackTrace();
      System.exit(2);
    }
  }

  // ----------------------------------------------------------------------------
  // ------------------------------- WordCounter --------------------------------
  // ----------------------------------------------------------------------------

  /**
   * Parses the given file, merges the words that share the same root and keeps
   * the merged list sorted by decreasing total number of occurrences.
   * <p>
   * The number of words found before and after merging is printed on the
   * standard output. As a side effect the sort mode is left on "by counter"
   * (see {@link #setSortOnWord(boolean)}).
   *
   * @param pathToTextFile path to the text file to read
   * @throws RuntimeException when the file cannot be found or read; the
   *         original I/O exception is available as the cause
   */
  public void countAllWords(String pathToTextFile) {
    Map mapCounterByWord = parseText(pathToTextFile);
    // - create list
    List lstAllCounter = new ArrayList(mapCounterByWord.values());
    System.out.println("Words found: " + lstAllCounter.size());
    // - merge on name: once sorted, counters sharing a root are adjacent
    setSortOnWord(true);
    Collections.sort(lstAllCounter);
    List lstMergedCounter = new ArrayList();
    Counter groupHead = null;
    Iterator iter = lstAllCounter.iterator();
    while (iter.hasNext()) {
      Counter counter = (Counter) iter.next();
      boolean merged = groupHead != null && counter.tryToMergeIn(groupHead);
      if (! merged) {
        // - first counter of a new root: it becomes the head of its group
        lstMergedCounter.add(counter);
        groupHead = counter;
      }
    }
    System.out.println("Words after merging: " + lstMergedCounter.size());
    // - sort on counter
    setSortOnWord(false);
    Collections.sort(lstMergedCounter);
    // - end
    this.lstSortedCounter = lstMergedCounter;
  }

  /**
   * @return the number of word groups kept by the last call to
   *         {@link #countAllWords(String)}, or 0 when nothing has been counted yet
   */
  public int getNumberOfWord() {
    return lstSortedCounter.size();
  }

  /**
   * Writes one line per word group, in the current sorted order, into the given
   * file. An existing file is overwritten. A confirmation message is printed on
   * the standard output once the file is written.
   *
   * @param outputFileName path of the file to create
   * @throws IOException when the file cannot be created or written
   */
  public void writeSortedListInto(String outputFileName) throws IOException {
    final String lineSeparator = System.getProperty("line.separator", "\n");
    FileOutputStream output = new FileOutputStream(outputFileName, false);
    boolean written = false;
    try {
      Writer writer = new BufferedWriter(new OutputStreamWriter(output, CHARSET_NAME));
      Iterator iter = lstSortedCounter.iterator();
      while (iter.hasNext()) {
        Counter counter = (Counter) iter.next();
        writer.write(counter.toString());
        writer.write(lineSeparator);
      }
      // - close() flushes the buffers: a failure here must reach the caller
      writer.close();
      written = true;
    }
    finally {
      if (! written) {
        // - release the file without hiding the exception being propagated
        try {
          output.close();
        }
        catch (IOException ignored) {
          // - the original failure is the one worth reporting
        }
      }
    }
    System.out.println("File " + outputFileName + " is written.");
  }

  /**
   * Selects the order used when the counters are sorted.
   *
   * @param b <code>true</code> to sort on the root then on the word,
   *          <code>false</code> to sort on the decreasing total counter
   */
  public void setSortOnWord(boolean b) {
    this.bSortOnWord = b;
  }

  // ----------------------------------------------------------------------------
  // --------------------------------- private ----------------------------------
  // ----------------------------------------------------------------------------

  /**
   * Reads the file line by line, splits each line into words and counts the
   * occurrences of every decoded word.
   *
   * @return a map from the decoded word (String) to its Counter
   * @throws WordCounterException when the file cannot be found or read
   */
  private Map parseText(String pathToTextFile) {
    FileInputStream input = null;
    try {
      input = new FileInputStream(pathToTextFile);
      BufferedReader reader = new BufferedReader(new InputStreamReader(input, CHARSET_NAME));
      Map mapCounterByWord = new HashMap();
      String line;
      while ((line = reader.readLine()) != null) {
        StringTokenizer tokenizer = new StringTokenizer(line, WORD_DELIMITERS);
        while (tokenizer.hasMoreTokens()) {
          String word = decodeWord(tokenizer.nextToken());
          Counter counter = (Counter) mapCounterByWord.get(word);
          if (counter == null) {
            counter = new Counter(word);
            mapCounterByWord.put(word, counter);
          }
          counter.increaseValue();
        }
      }
      return mapCounterByWord;
    }
    catch (FileNotFoundException x) {
      throw new WordCounterException("Cannot find the file: " + pathToTextFile, x);
    }
    catch (IOException x) {
      throw new WordCounterException(x);
    }
    finally {
      if (input != null) {
        try {
          input.close();
        }
        catch (IOException ignored) {
          // - nothing was written: a failure to close cannot lose data
        }
      }
    }
  }

  /**
   * Replaces every character of the word by its decoded form. Characters that
   * cannot be decoded are kept unchanged (and logged once).
   */
  private static String decodeWord(String word)
  {
    // - avoid building a new String when there is nothing to change
    if (isDecodedWord(word)) {
      return word;
    }
    int length = word.length();
    StringBuffer sb = new StringBuffer(length);
    for (int i = 0; i < length; i++) {
      sb.append(decodeCharacter(word.charAt(i)));
    }
    return sb.toString();
  }

  /**
   * @return <code>true</code> when decoding the word would not change any character
   */
  private static boolean isDecodedWord(String word)
  {
    int length = word.length();
    for (int i = 0; i < length; i++) {
      char c = word.charAt(i);
      if (c != decodeCharacter(c)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Decodes every character of the word and drops the characters that cannot
   * be decoded. The result can be empty.
   */
  private static String cleanWord(String word)
  {
    // - avoid building a new String when there is nothing to change
    if (isCleanedWord(word)) {
      return word;
    }
    int length = word.length();
    StringBuffer sb = new StringBuffer(length);
    for (int i = 0; i < length; i++) {
      char decoded = tryToDecodeCharacter(word.charAt(i));
      if (decoded != UNKNOWN_CHARACTER) {
        sb.append(decoded);
      }
    }
    return sb.toString();
  }

  /**
   * @return <code>true</code> when cleaning the word would not change or drop any character
   */
  private static boolean isCleanedWord(String word)
  {
    int length = word.length();
    for (int i = 0; i < length; i++) {
      char c = word.charAt(i);
      if (c != tryToDecodeCharacter(c)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Decodes the character; an unknown character is returned unchanged and
   * reported once on the standard error output.
   */
  private static char decodeCharacter(char c)
  {
    char decoded = tryToDecodeCharacter(c);
    if (decoded != UNKNOWN_CHARACTER) {
      return decoded;
    }
    // - log each unknown character only the first time it is met
    if (setNotDecodedCharacters.add(String.valueOf(c))) {
      System.err.println("Cannot decode character: " + c + " (" + (int) c + ")");
    }
    return c;
  }

  /**
   * Maps ASCII letters to lower case, keeps digits and a few punctuation
   * characters, and replaces the supported accented letters by their base
   * letter.
   *
   * @return <code>(char) -1</code> when the character is unknown
   */
  private static char tryToDecodeCharacter(char c)
  {
    if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
      return c;
    }
    if (c >= 'A' && c <= 'Z') {
      return Character.toLowerCase(c);
    }
    switch (c) {
      case '-': case '_': case '.': case '%': case '/':
        return c;
      case '\u00e9': case '\u00e8': case '\u00ea': case '\u00eb':
      case '\u00c9': case '\u00c8': case '\u00ca': case '\u00cb':
        return 'e';
      case '\u00f9': case '\u00fc': case '\u00fb':
      case '\u00d9': case '\u00dc': case '\u00db':
        return 'u';
      case '\u00e0': case '\u00e4': case '\u00e2':
      case '\u00c0': case '\u00c4': case '\u00c2':
        return 'a';
      case '\u00e7': case '\u00c7':
        return 'c';
      case '\u00f4': case '\u00f6': case '\u00d4': case '\u00d6':
        return 'o';
      case '\u00ee': case '\u00ef': case '\u00ce': case '\u00cf':
        return 'i';
      default:
        return UNKNOWN_CHARACTER;
    }
  }

  /**
   * Computes the root used to group the forms of a word: the exceptions are
   * handled first, then a final "es", "e" or "s" is removed, and when an "e"
   * or "es" has been removed a doubled final letter is reduced to one.
   *
   * @param baseWord the cleaned word
   * @return the root of the word, possibly empty
   */
  private static String computeRootOfWord(String baseWord) {
    // - except
    if ("ile".equals(baseWord) || "iles".equals(baseWord)) {
      return "ile";
    }
    if ("pays".equals(baseWord) || "ete".equals(baseWord)) {
      return baseWord;
    }
    // - compute root
    int length = baseWord.length();
    String root;
    boolean checkDouble;
    if (baseWord.endsWith("es")) {
      root = baseWord.substring(0, length - 2);
      checkDouble = true;
    }
    else if (baseWord.endsWith("e")) {
      root = baseWord.substring(0, length - 1);
      checkDouble = true;
    }
    else if (baseWord.endsWith("s")) {
      root = baseWord.substring(0, length - 1);
      checkDouble = false;
    }
    else {
      root = baseWord;
      checkDouble = false;
    }
    int rootLength = root.length();
    if (checkDouble && rootLength > 2
        && root.charAt(rootLength - 2) == root.charAt(rootLength - 1)) {
      root = root.substring(0, rootLength - 1);
    }
    return root;
  }

  // - variables
  /** Unknown characters already reported by {@link #decodeCharacter(char)}, as one-character Strings. */
  private static Set setNotDecodedCharacters = new HashSet();

  // ----------------------------------------------------------------------------
  // -------------------------------- variables ---------------------------------
  // ----------------------------------------------------------------------------

  /** Current sort mode, read by {@link Counter#compareTo(Object)}. */
  private boolean bSortOnWord = true;

  /** Result of the last count; empty until {@link #countAllWords(String)} is called. */
  private List lstSortedCounter = new ArrayList();

  // ############################################################################
  // ################################ Counter ###################################
  // ############################################################################

  /**
   * Number of occurrences of one word, plus the counters of the other words
   * that have been merged into it because they share the same root.
   * <p>
   * The natural order depends on the sort mode of the enclosing WordCounter.
   */
  private class Counter implements Comparable {

    private String word, cleanedWord, rootOfWord;
    /** value: occurrences of this word only; totalValue: cached total, -1 until computed. */
    private int value, totalValue = -1;
    private List lstMergedCounter = new ArrayList();

    public Counter(String word) {
      this.word = word;
      this.value = 0;
      this.cleanedWord = cleanWord(word);
      this.rootOfWord = computeRootOfWord(cleanedWord);
    }

    public String getWord() {
      return word;
    }

    public String getRootOfWord() {
      return rootOfWord;
    }

    /**
     * @return the occurrences of this word plus the totals of all the merged
     *         counters; computed on the first call and cached afterwards, so it
     *         must not be called before the merge is complete
     */
    public int getTotalValue() {
      if (totalValue == -1) {
        int total = value;
        Iterator iter = lstMergedCounter.iterator();
        while (iter.hasNext()) {
          Counter merged = (Counter) iter.next();
          total += merged.getTotalValue();
        }
        this.totalValue = total;
      }
      return totalValue;
    }

    public void increaseValue() {
      this.value++;
    }

    /**
     * @return the total, a tabulation, the word and, when other words have been
     *         merged in, their own counters and words between parentheses
     */
    public String toString() {
      return createString("\t");
    }

    /**
     * Adds this counter to the merged counters of the other one when both
     * share the same root.
     *
     * @param other is the previous Counter in List sorted in alphabetical order
     * @return <code>true</code> when this counter has been merged into the other one
     */
    public boolean tryToMergeIn(Counter other) {
      if (! rootOfWord.equals(other.getRootOfWord())) {
        return false;
      }
      other.lstMergedCounter.add(this);
      return true;
    }

    // - Comparator

    /**
     * Compares on the root then on the cleaned word when the enclosing
     * WordCounter sorts on word, otherwise on the decreasing total then on the
     * cleaned word. The word itself is the last criterion, so that two
     * different words never compare as equal and the result of a sort does not
     * depend on the initial order of the list.
     */
    public int compareTo(Object o) {
      Counter other = (Counter) o;
      if (bSortOnWord) {
        int diff = rootOfWord.compareTo(other.rootOfWord);
        if (diff != 0) {
          return diff;
        }
      }
      else {
        int otherTotal = other.getTotalValue();
        int thisTotal = getTotalValue();
        if (otherTotal != thisTotal) {
          // - biggest total first
          return otherTotal > thisTotal ? 1 : -1;
        }
      }
      int diff = cleanedWord.compareTo(other.cleanedWord);
      if (diff != 0) {
        return diff;
      }
      return word.compareTo(other.word);
    }

    /**
     * Builds "total, separator, word", followed by the list of the merged
     * counters (each one built with a space as separator) when there are some.
     */
    private String createString(String separator) {
      StringBuffer sb = new StringBuffer();
      sb.append(getTotalValue());
      sb.append(separator);
      sb.append(getWord());
      if (lstMergedCounter.isEmpty()) {
        return sb.toString();
      }
      // - merged counters
      sb.append(" (dont ");
      Iterator iter = lstMergedCounter.iterator();
      while (iter.hasNext()) {
        Counter merged = (Counter) iter.next();
        sb.append(merged.createString(" "));
        sb.append(iter.hasNext() ? ", " : ")");
      }
      return sb.toString();
    }
  }

  // ############################################################################
  // ########################## WordCounterException ############################
  // ############################################################################

  /**
   * Unchecked exception thrown when the text file cannot be found or read.
   */
  private static class WordCounterException extends RuntimeException {

    public WordCounterException(String msg) {
      super(msg);
    }

    public WordCounterException(Throwable cause) {
      super(cause);
    }

    public WordCounterException(String msg, Throwable cause) {
      super(msg, cause);
    }
  }
}
