import java.util.*;
class Textutils extends Groupdata {

/* Word stored for each word-frequency bin: wordfreqs_() puts the word
   that created bin i into binword[i] and compares later words against it. */
static String binword[] = new String[BIGNMWRDBINS];

/* Count accumulated for each bin; wordfreq[i] belongs to binword[i]. */
static double wordfreq[] = new double[BIGNMWRDBINS];

/* Number of bins currently in use (indexes 0 .. nmwordbins - 1). */
static int nmwordbins;

// ---------------------------------------------------------------------

static double similar_(String phrase1[], int ph1nmwrds,
   String phrase2[], int ph2nmwrds) {

/* Measures the similarity between "phrase1" and "phrase2" using a
   modified version of the "Word Simple N-gram Overlap" measure of

   Cordeiro, J., Dias, G., Brazdil, P. 2007, A Metric for paraphrase
   detection. Proceedings of the International Multi-Conference on
   Computing in the Global Information Technology, IEEE Computer Society.
   Available at http://ieeexplore.ieee.org/document/4137062/

   phrase1, phrase2:     words of each phrase; only the first ph1nmwrds
                         (respectively ph2nmwrds) entries are read.
   ph1nmwrds, ph2nmwrds: number of words in each phrase.

   Returns
   - levenshtein_() of the two words when both phrases hold one word;
   - the shorter-to-longer word-count ratio when that ratio is below .8;
   - otherwise tanh(overlap / nmtrys) multiplied by that ratio, where
     "overlap" adds i^2 for every pair of i-grams (one from each phrase)
     whose levenshtein_() similarity exceeds .99, and "nmtrys" counts the
     i-gram pairs that start at the same position in both phrases.

   iderr_() is called when a word count is below 1 or when the result
   falls outside [0, 1].
*/

String ph1igrams[], ph2igrams[];

int i, j, k, capn, nmsharedphrases, nmtrys = 0;

double overlap = 0., phlngthratio, phrasesim;

if (ph1nmwrds < 1 || ph2nmwrds < 1) {
   iderr_("similar: ph1nmwrds= " + ph1nmwrds + " ph2nmwrds= " + ph2nmwrds);
}

if (ph1nmwrds == 1 && ph2nmwrds == 1) {
   return (levenshtein_(phrase1[0], phrase2[0]));
}

// Return length ratio for phrases of very different lengths.

phlngthratio = ((double) ph1nmwrds) / ((double) ph2nmwrds);
if (ph1nmwrds > ph2nmwrds) {
   phlngthratio = 1. / phlngthratio;
}
if (phlngthratio < .8) {
   return (phlngthratio);
}

capn = Math.min(ph1nmwrds, ph2nmwrds);

if (capn < 1) {
   iderr_("similar: capn= " + capn);
}

for (i = 1; i <= capn; ++i) {

   /* Build every i-gram of each phrase once, instead of rebuilding both
      strings for every (j, k) pair. */

   ph1igrams = bldigrams_(phrase1, ph1nmwrds, i);
   ph2igrams = bldigrams_(phrase2, ph2nmwrds, i);

   // Count the number of i-grams shared between the two phrases.

   nmsharedphrases = 0;
   for (j = 0; j < ph1igrams.length; ++j) {
      for (k = 0; k < ph2igrams.length; ++k) {
         if (levenshtein_(ph1igrams[j], ph2igrams[k]) > .99) {
            ++nmsharedphrases;
         }

         // Count the number of trys (pairs starting at the same position).

         if (j == k) {
            ++nmtrys;
         }
      }
   }

   // Longer shared i-grams weigh quadratically more.

   overlap += nmsharedphrases * Math.pow((double) i, 2.0);
}

// Ponzetto and Strube measure:

phrasesim = Math.tanh(overlap / ((double) nmtrys));

// Penalty for phrase-length difference.

phrasesim *= phlngthratio;

if (phrasesim < 0. || phrasesim > 1.) {
   iderr_("similar: phrasesim= " + phrasesim);
}

return phrasesim;
}

private static String[] bldigrams_(String phrase[], int nmwrds, int n) {

/* Returns every n-gram of the first "nmwrds" words of "phrase", ordered
   by starting position, with the words of each n-gram joined by "_". */

int start, l;

String grams[] = new String[nmwrds - n + 1];

for (start = 0; start < grams.length; ++start) {
   grams[start] = phrase[start];
   for (l = 1; l < n; ++l) {
      grams[start] += "_" + phrase[start + l];
   }
}

return grams;
}

// ---------------------------------------------------------------------

static double levenshtein_(String source, String target) {

/* Computes a measure of string similarity called the Levenshtein
   Distance -- also known as the Edit Distance.  This implementation is
   a modified version of the code by Chas Emerick, see
   www.merriampark.com/ldjava.htm.

   i, j: indexes into strings s and t
   t_j:  jth character of t
   cost: cost
   _d[]; placeholder to assist in swapping p and d.

   Routine computes a measure of similarity equal to 1 - cost / max length.
*/

char t_j;

String dumstrng, s, t;

int i, j, cost, _d[];

double retval, maxlngth;

if (source == null || target == null) {
   /*
   printf_("similar: source= " + source + " target= " + target);
   printf_("similar: null string(s)");
   */

   return 0.;
}

// Trim front and back whitespace and convert to lowercase.

dumstrng = source;
dumstrng = dumstrng.toLowerCase();
s = dumstrng.trim();

dumstrng = target;
dumstrng = dumstrng.toLowerCase();
t = dumstrng.trim();

/* Remove integers?? and various irrelevant strings so that the
   similarity measure is increased. */

/*
for (i = 0; i < 10; ++i) {
   s = strngsub_(s, Integer.toString(i), "");
   t = strngsub_(t, Integer.toString(i), "");
}
*/

s = strngsub_(s, "lbs_of_", "");
t = strngsub_(t, "lbs_of_", "");
s = strngsub_(s, "to_be_", "");
t = strngsub_(t, "to_be_", "");
s = strngsub_(s, "mountain_bongo_", "");
t = strngsub_(t, "mountain_bongo_", "");
s = strngsub_(s, "northern_", "");
t = strngsub_(t, "northern_", "");
s = strngsub_(s, "one,", "");
t = strngsub_(t, "one,", "");
s = strngsub_(s, "two,", "");
t = strngsub_(t, "two,", "");
s = strngsub_(s, "three,", "");
t = strngsub_(t, "three,", "");
s = strngsub_(s, "km2_", "");
t = strngsub_(t, "km2_", "");

// Find lengths.

int n = s.length();
int m = t.length();
if (n == 0 || m == 0) {
   return 0.;
}

// Find maximum string length.

maxlngth = Math.max(m, n);

int p[] = new int[n + 1]; // 'Previous' cost array, horizontally.
int d[] = new int[n + 1]; // Cost array, horizontally.

for (i = 0; i <= n; ++i) {
   p[i] = i;
}

for (j = 1; j <= m; j++) {
   t_j = t.charAt(j - 1);
   d[0] = j;

   for (i = 1; i <= n; ++i) {
      cost = s.charAt(i - 1) == t_j ? 0 : 1;

      /* Minimum of cell to the left + 1, to the top + 1, diagonally
	 left and up + cost. */

      d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
   }

   // Copy current distance counts to 'previous row' distance counts.

   _d = p;
   p = d;
   d = _d;
}

/* Our last action in the above loop was to switch d and p, so p now
   actually has the most recent cost counts. */

retval = (double) p[n];

return 1. - retval / maxlngth;
}

// ---------------------------------------------------------------------

static double jaccardsim_(int maxnmelements, String charvec1[],
   String charvec2[]) {

/* Computes the Jaccard similarity |A n B| / |A u B| of the two string
   collections.

   maxnmelements:      number of leading slots of each array to examine;
                       an array shorter than this is read to its end.
   charvec1, charvec2: the elements; null slots (and a null array) are
                       treated as absent.

   Elements are compared as sets, so a string repeated within one array
   counts once.  Returns 0 when neither array holds any element.
*/

int i, nm1, nm2, nmunion;

Set<String> set1 = new HashSet<String>();
Set<String> set2 = new HashSet<String>();
Set<String> common;

nm1 = (charvec1 == null) ? 0 : Math.min(maxnmelements, charvec1.length);
nm2 = (charvec2 == null) ? 0 : Math.min(maxnmelements, charvec2.length);

for (i = 0; i < nm1; ++i) {
   if (charvec1[i] != null) {
      set1.add(charvec1[i]);
   }
}

for (i = 0; i < nm2; ++i) {
   if (charvec2[i] != null) {
      set2.add(charvec2[i]);
   }
}

common = new HashSet<String>(set1);
common.retainAll(set2);

// |A u B| = |A| + |B| - |A n B|.

nmunion = set1.size() + set2.size() - common.size();
if (nmunion == 0) {
   return 0.;
}

return ((double) common.size()) / ((double) nmunion);
}

// ---------------------------------------------------------------------

static double nmdays_(int actionid, int year, int month, int day) {

/* Returns "day" plus the monthdays[] entries of every month before
   "month", plus one extra day for the years treated as leap years.

   actionid: used only in the error message.
   year:     compared against the codes 4, 14 and 16.
             NOTE(review): the year encoding is not visible here, and the
             extra day is added whatever the month -- confirm with callers.
   month:    1-based month number; iderr_() is called when it exceeds 12.
   day:      day of the month.
*/

boolean leapyear;

int m, total;

if (month > 12) {
   iderr_("nmdays: actionid= " + actionid + " month= " + month + " is > 12");
}

// Start from the day of the month and add each full month before it.

total = day;
for (m = 1; m < month; ++m) {
   total += monthdays[m - 1];
}

// Leap years.

leapyear = (year == 4 || year == 14 || year == 16);
if (leapyear) {
   total += 1;
}

return (double) total;
}

// ----------------------------------------------------------------------

static void wordfreqs_(String word, boolean createnew) {

// Creates a word freqency array.

int i;

// Ignore common words.

if (word.equals("and") ||
    word.equals("a") ||
    word.equals("of") ||
    word.equals("the") ||
    word.equals("to") ||
    word.equals("is") ||
    word.equals("in") ||
    word.equals("0") ||
    word.equals("=") ||
    word.equals("Answer:") ||
    word.equals("be") ||
    word.equals("that") ||
    word.equals("are") ||
    word.equals("as") ||
    word.equals("The")) {
   return;
}

// Find word bin.

for (i = 0; i < nmwordbins; ++i) {
   if (levenshtein_(word, binword[i]) > .95) {
      wordfreq[i] += 1.;
      return;
   }
}

if (!createnew) {
   return;
}

// Create a new word bin.

binword[nmwordbins] = word;
wordfreq[nmwordbins] = 1.;
++nmwordbins;

if (nmwordbins == BIGNMWRDBINS) {
   iderr_("wordfreqs: nmwordbins = BIGNMWRDBINS");
}
return;
}

// -----------------------------------------------------------------

/* An optionally negative run of digits with an optional fractional part.
   Compiled once here rather than on every call to replacenumeric_(). */
private static final java.util.regex.Pattern NUMERICWORD =
   java.util.regex.Pattern.compile("-?\\d+(\\.\\d+)?");

static String replacenumeric_(String str) {

/* Replaces a numeric word by the placeholder "some".

   Returns "some" when the whole of "str" is a decimal number (such as
   "42", "-7" or "3.14"); otherwise returns "str" unchanged.  Forms such
   as "3.", ".5", "+1" or "1e5" are not treated as numbers.
*/

return NUMERICWORD.matcher(str).matches() ? "some" : str;
}

// -----------------------------------------------------------------

static int conjugate_(String verbtarget, String conjugform[]) {

/* Detects a regular verb, conjugates it, and returns the number of
   conjugated forms.

   verbtarget: candidate verb; it is treated as regular when it ends in
               "ed" and has at least one character before that ending.
   conjugform: output array with room for at least three entries.  On
               success it receives
                  [0] the stem (verbtarget without "ed"),
                  [1] the stem + "ing",
                  [2] the stem + "s".

   Returns 3 when the forms were stored, or 0 (leaving conjugform
   untouched) when verbtarget is null, too short, or does not end in "ed".
   The stem is formed purely by dropping "ed"; no spelling adjustment is
   made.
*/

String stem;

/* Detect a regular verb by checking for its past-tense form.  The length
   test keeps one- and zero-character inputs from being indexed out of
   range and rejects a bare "ed", which has no stem. */

if (verbtarget == null || verbtarget.length() <= 2 ||
    !verbtarget.endsWith("ed")) {
   return 0;
}

/* Compute and store conjugation: stem, -ing form, -s form. */

stem = verbtarget.substring(0, verbtarget.length() - 2);
conjugform[0] = stem;
conjugform[1] = stem + "ing";
conjugform[2] = stem + "s";

return 3;
}
}
