//PRET-Extractor
//Copyright (c) 2013 Tetsuya Kanda
//
//http://sel.ist.osaka-u.ac.jp/pret/
//
//Permission is hereby granted, free of charge, to any person obtaining
//a copy of this software and associated documentation files (the
//"Software"), to deal in the Software without restriction, including
//without limitation the rights to use, copy, modify, merge, publish,
//distribute, sublicense, and/or sell copies of the Software, and to
//permit persons to whom the Software is furnished to do so, subject to
//the following conditions:
//
//The above copyright notice and this permission notice shall be
//included in all copies or substantial portions of the Software.
//
//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
//EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
//MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
//NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
//LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
//OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
//WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

package jp.ac.osaka_u.ist.sel.pret.engine.similarity;

import gnu.trove.iterator.TIntIterator;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.TIntIntMap;
import gnu.trove.map.TObjectIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TObjectIntHashMap;
import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;

import java.io.Serializable;
import java.nio.file.Paths;
import java.util.Collection;
import java.util.List;

import jp.ac.osaka_u.ist.sel.pret.engine.data.FileInfo;
import jp.ac.osaka_u.ist.sel.pret.engine.diff.TKJavaDiff;

/**
 * Maps terms to integer IDs and records per-file term frequencies.
 * <p>
 * Once {@link #set(FileInfo)} has been called for two files,
 * {@link #maxSim(FileInfo, FileInfo)} derives a similarity value for them
 * from their term-frequency maps alone.
 * <p>
 * The dictionary and {@link #nextWID} are mutated without any
 * synchronization in this class.
 * 
 * @author t-kanda
 * 
 */
public class WordReduce implements Serializable {

	private static final long serialVersionUID = 1323560663916209548L;

	/** Dictionary from term to the integer ID assigned to it. */
	private final TObjectIntMap<String> words;

	/**
	 * ID that will be assigned to the next previously unseen term. IDs are
	 * handed out in increasing order starting from
	 * {@link Integer#MIN_VALUE}.
	 */
	public int nextWID = Integer.MIN_VALUE;

	/**
	 * Creates an instance with an empty dictionary.
	 */
	public WordReduce() {
		words = new TObjectIntHashMap<>();
	}

	/**
	 * Calls {@link #set(FileInfo)} for every element of the collection, in
	 * iteration order.
	 * 
	 * @param fis
	 *            files whose term frequencies should be recorded
	 */
	public void setAll(Collection<FileInfo> fis) {
		for (FileInfo fi : fis) {
			set(fi);
		}
	}

	/**
	 * Records the term-frequency map of a file. Afterwards fi.getFreq() can
	 * be used.
	 * <p>
	 * Every string returned by {@code TKJavaDiff.listLines} for the path in
	 * {@code fi.preSim} is treated as one term. Terms not yet in the
	 * dictionary are registered under a fresh ID taken from
	 * {@link #nextWID}.
	 * 
	 * @param fi
	 *            target file information
	 */
	public void set(FileInfo fi) {
		TIntIntMap freq = new TIntIntHashMap();

		for (String w : TKJavaDiff.listLines(Paths.get(fi.preSim))) {
			int id;
			if (words.containsKey(w)) {
				id = words.get(w);
			} else {
				// first occurrence of this term: assign the next free ID
				id = nextWID;
				words.put(w, id);
				nextWID++;
			}

			// add one to the count, or start the count at one for a new ID
			freq.adjustOrPutValue(id, 1, 1);
		}
		fi.setFreq(freq);
	}

	/**
	 * Gets the maximum similarity between two files, computed from their
	 * term-frequency maps only.
	 * <p>
	 * With {@code a} and {@code b} the summed frequencies of each file and
	 * {@code ab} the sum, over all term IDs, of the smaller of the two
	 * frequencies, the result is {@code ab / (a + b - ab)}.
	 * <p>
	 * If the denominator is zero (for example when both frequency maps are
	 * empty) {@code 1.0f} is returned instead of {@code NaN}: two files
	 * without any terms are indistinguishable, and 1 is always a valid upper
	 * bound for a similarity.
	 * 
	 * @param fi1
	 *            first file; its frequency map must already be set
	 * @param fi2
	 *            second file; its frequency map must already be set
	 * @return maximum similarity
	 */
	public float maxSim(FileInfo fi1, FileInfo fi2) {
		TIntIntMap tf1 = fi1.getFreq();
		TIntIntMap tf2 = fi2.getFreq();

		// list up terms: union of the IDs occurring in either file
		TIntSet targetWIDs = new TIntHashSet(tf1.keySet());
		targetWIDs.addAll(tf2.keySet());

		// count
		int a = 0, b = 0, ab = 0;
		for (TIntIterator it = targetWIDs.iterator(); it.hasNext();) {
			int wid = it.next();
			// an ID absent from a map yields that map's no-entry value
			// (0 for Trove maps created with default settings)
			int tfa = tf1.get(wid);
			int tfb = tf2.get(wid);
			a += tfa;
			b += tfb;
			ab += Math.min(tfa, tfb);
		}

		int union = a + b - ab;
		if (union == 0) {
			// avoid 0/0 = NaN
			return 1.0f;
		}
		return (float) ab / union;
	}

	/**
	 * Converts a list of terms into the list of their IDs, preserving order.
	 * <p>
	 * A term that is not in the dictionary is translated to the dictionary's
	 * no-entry value (0 for Trove maps created with default settings); no
	 * new ID is registered.
	 * 
	 * @param src
	 *            terms to translate
	 * @return IDs of the terms, in the same order as {@code src}
	 */
	public TIntList toIntList(List<String> src) {
		TIntList result = new TIntArrayList(src.size());
		for (String w : src) {
			result.add(words.get(w));
		}
		return result;
	}

	/**
	 * Converts a list of terms into an array of their IDs, preserving order.
	 * <p>
	 * A term that is not in the dictionary is translated to the dictionary's
	 * no-entry value (0 for Trove maps created with default settings); no
	 * new ID is registered.
	 * 
	 * @param src
	 *            terms to translate
	 * @return IDs of the terms, in the same order as {@code src}
	 */
	public int[] toIntArray(List<String> src) {
		int[] result = new int[src.size()];
		// iterate rather than call src.get(i), which is linear per call on
		// non-random-access lists
		int i = 0;
		for (String w : src) {
			result[i++] = words.get(w);
		}
		return result;
	}
}
