import java.util.*;
import java.io.*;
import org.jsoup.Jsoup;

class Productthemes extends Storyparse {

/* Name of the social media posts file opened by themeclusters_().  It is
   not assigned anywhere in this class, so it has to be set before
   themeclusters_() is called. */

static String socialmediaposts;

/* Words that are stripped from every sentence before phrases are formed.
   Matching is case-sensitive. */

private static final Set<String> FILLERWORDS = new HashSet<String>(
   Arrays.asList("a", "an", "I", "I'm", "I've", "the"));

/* phrs[p][w] is word w of candidate phrase p for the current restart and
   phrslngth[p] is its number of words; bestphrs keeps the phrases that
   belong to the best clustering found so far.  checkword holds the words
   read from checkwords.dat. */

static String phrs[][] = new String[MAXN][10];
static String bestphrs[][] = new String[MAXN][10];
static String checkword[] = new String[600];

/* clussol[c][j] is the 1-based phrase number of member j of cluster c and
   clussize[c] is the number of members, as filled in by
   Kmedcls.kmedphrases_(); the best* arrays are copies of the best
   solution found so far. */

static int phrslngth[] = new int[MAXN];
static int clussol[][] = new int[MAXCLS][MAXN];
static int clussize[] = new int[MAXCLS];
static int bestclussol[][] = new int[MAXCLS][MAXN];
static int bestclussize[] = new int[MAXCLS];

// ------------------------------------------------------------------------

static void themeclusters_() {

/* Finds clusters of themes associated with a given product.

   Steps:
     1. Read the posts file, strip markup with Jsoup and split the text
        into sentences (Storyutils.readstory_).
     2. Remove filler words from every sentence.
     3. Repeat 100 times: cut every sentence into non-overlapping m-word
        phrases starting at a random offset, keep the phrases containing
        at least one check word, and cluster them for k = 3..maxk.
     4. Keep the solution with the smallest value returned by
        Kmedcls.kmedphrases_() and print it.

   One way to get rid of not-useful phrases is to only use phrases that
   occur at least once in the corpus. */

String line;

int i, i1, j, k, l, m, nmwrds, nmphrses, maxk, moffset, nmclosest,
   nmcheckwords = 0, bestmaxk = 0, bestm = 0, bestk = 0;

double db, mindb = 1.e6;

boolean found;

// Open social media posts file.

fleopen_(4, socialmediaposts, 'r');

/* Read-in text of social media posts.  The line is tested for null
   before it is appended, so the text "null" is never added to the
   posts (presumably fgetline_ returns null at end of file). */

StringBuilder posts = new StringBuilder(" ");
while ((line = fgetline_(4)) != null) {
   posts.append(' ').append(line);
}
fclose_(4, 'r');
soupstring = posts.toString();

articletext = Jsoup.parse(soupstring).text();

// Read story into "sentence" array.

Storyutils.readstory_(4, 0);

printf_("themeclusters: nmtextsentences= " + nmtextsentences);

/* Remove non-useful words.  A sentence is scanned from its first word;
   scanning stops at the end of the sentence or as soon as only one word
   is left. */

for (i = 0; i < nmtextsentences; ++i) {
   nmwrds = nmtextsenwrds[i];
   if (nmwrds < 1) {
      continue;   // Empty sentence: there is no word to look at.
   }
   j = 0;
   do {
      if (isfillerword_(textsentence[i][j])) {

         // Close the gap; position j now holds the next word.

         System.arraycopy(textsentence[i], j + 1, textsentence[i], j,
            nmwrds - 1 - j);
         --nmwrds;
      } else {
         ++j;
      }
   } while (j < nmwrds && nmwrds > 1);
   nmtextsenwrds[i] = nmwrds;
}

/* Read-in check words.  Each record is a word followed by an integer and
   a string that are read and discarded.  The words are also put in a
   hash set so that phrase filtering does not rescan the whole list. */

Set<String> checkset = new HashSet<String>();

fleopen_(5, "checkwords.dat", 'r');
do {
   if (nmcheckwords == checkword.length) {
      iderr_("themeclusters: nmcheckwords= " + nmcheckwords);
   }
   checkword[nmcheckwords] = fgetstrng_(5);
   fgetint_(5);
   fgetstrng_(5);
   checkset.add(checkword[nmcheckwords]);
   ++nmcheckwords;
} while (!checkeof_(5));
fclose_(5, 'r');

/* Form m-grams. Note to me: for ginseng1.dat, m=7 and the same offset
   for each sentence gives a reasonable solution.  m must not exceed the
   second dimension of phrs (10). */

m = 7;
for (i1 = 0; i1 < 100; ++i1) {
   nmphrses = 0;
   for (l = 0; l < nmtextsentences; ++l) {

      // Generate a random offset for this sentence.

      moffset = (int) (((double) m) * Rndm.rndm1_(0, 0));

      /* Step an m-word long window across the sentence; only complete
         windows are used. */

      for (i = moffset; i + m <= nmtextsenwrds[l]; i += m) {
         found = false;
         for (j = 0; j < m; ++j) {
            phrs[nmphrses][j] = Textutils.replacenumeric_(
               textsentence[l][i + j]);
            if (checkset.contains(phrs[nmphrses][j])) {
               found = true;
            }
         }
         phrslngth[nmphrses] = m;

         /* Keep the phrase only if it contains at least one check word
            (an adjective/noun/verb, per the original note); otherwise
            its slot is overwritten by the next window. */

         if (found) {
            ++nmphrses;
            if (nmphrses == MAXN) {
               iderr_("themeclusters: nmphrses= " + nmphrses);
            }
         }
      }
   }

   if (nmphrses < 6) {
      printf_("themeclusters: m= " + m + " nmphrses= " + nmphrses + " skipping");
      continue;
   }

   maxk = Math.min(10, nmphrses / 2);

   // Form phrase clusters for every k and remember the best solution.

   for (k = 3; k <= maxk; ++k) {
      db = Kmedcls.kmedphrases_(nmphrses, phrs, phrslngth, k, clussol,
              clussize);

      if (db < mindb) {
         mindb = db;
         bestm = m;
         bestk = k;
         bestmaxk = maxk;
         for (i = 0; i < k; ++i) {
            bestclussize[i] = clussize[i];
            System.arraycopy(clussol[i], 0, bestclussol[i], 0, clussize[i]);
         }
         for (i = 0; i < nmphrses; ++i) {
            System.arraycopy(phrs[i], 0, bestphrs[i], 0, m);
         }
      }
   }
}
printf_("themeclusters: bestmaxk= " + bestmaxk + " bestm= " + bestm +
   " bestk= " + bestk + " mindb= " + mindb);

/* Print out every phrase of each cluster of the best solution, in the
   order stored in bestclussol.  According to the original note the
   solution is assumed to be sorted by closeness to the cluster's medoid;
   that cannot be confirmed from this file.  An earlier rule that printed
   only round(.3 * size) phrases, limited to the interval 2 to
   min(4, size), was already overridden and is not applied. */

for (i = 0; i < bestk; ++i) {
   nmclosest = bestclussize[i];
   printf_("\nCluster= " + (i + 1) + " size= " + bestclussize[i] +
      " nmclosest= " + nmclosest);
   for (j = 0; j < nmclosest; ++j) {

      // Phrase numbers in the solution are 1-based.

      printf_("phrase= " + joinphrase_(bestphrs[bestclussol[i][j] - 1],
         bestm));
   }
}
}

// ------------------------------------------------------------------------

private static boolean isfillerword_(String word) {

// Returns true if word is one of the filler words removed from sentences.

return FILLERWORDS.contains(word);
}

// ------------------------------------------------------------------------

private static String joinphrase_(String words[], int nmwrds) {

/* Returns words[0] followed by words[1] .. words[nmwrds - 1], separated
   by single blanks. */

StringBuilder joined = new StringBuilder();
int k;

joined.append(words[0]);
for (k = 1; k < nmwrds; ++k) {
   joined.append(' ').append(words[k]);
}
return joined.toString();
}
}
