public class PlagDetect extends MOC {

// Name of the corpora file handed to createfreqdat_ by clstrsep_.
static String docfle = "none";

// docword[student][document][word]: the words of every document, copied
// from Storyutils.wordvec while the corpora file is read.
static String docword[][][] = new String[NMSTDNTS][NMDOCS][NMWRDS];
// Scratch vector used by createfreqdat_ to reorder Textutils.binword.
static String dumstrngvec[] = new String[NMWRDBINS];

// Number of most-frequent words kept as variables; passed to aceclus_
// as its "p" argument by clstrsep_.
static int p = 10;

// n[student]: number of documents read for that student.
static int n[] = new int[NMSTDNTS];
// nmdocwords[student][document]: word count of that document.
static int nmdocwords[][] = new int[NMSTDNTS][NMDOCS];
// Work vector passed to Eqslv.decomp_ and Dkernl.dist_ alongside wdecomp.
static int ipvt[] = new int[NMWRDBINS];
// nmclstr[c - 1]: number of observations carrying cluster label c.
static int nmclstr[] = new int[NMCLSTRS];
// Index vector filled with 1-based positions and passed to Idsort.idsort_.
static int index[] = new int[BIGNMWRDBINS];
// Cluster label per observation; clstrmbrs_ treats 0 as "unassigned."
static int clstrmbrshp[] = new int[NMDOCS];
// Membership giving the largest cluster count inside estclstrs_.
static int clstrbest[] = new int[NMDOCS];
// Membership of the u-value with the largest separateness score (aceclus_).
static int mbrshpbest[] = new int[NMDOCS];
// Link lists built by estclstrs_: each row is a 1-based (row, column)
// pair of the "d" matrix, one list per high-betweenness observation.
static int chkarray1[][] = new int[NMDOCS][2];
static int chkarray2[][] = new int[NMDOCS][2];
static int chkarray3[][] = new int[NMDOCS][2];

// General-purpose scratch vectors.
static double dumvec1[] = new double[NMWRDBINS];
static double dumvec2[] = new double[NMWRDBINS];
// xmean[0] holds the grand mean; xmean[c] holds the mean of cluster c.
static double xmean[][] = new double[NMCLSTRS][NMWRDBINS];
// Within-cluster covariance matrix estimated by the ACE iterations.
static double w[][] = new double[NMWRDBINS][NMWRDBINS];
// Copy of "w" that is handed to Eqslv.decomp_.
static double wdecomp[][] = new double[NMWRDBINS][NMWRDBINS];
// Passed to Dkernl.dist_; never written by the methods of this class.
static double msdinv[][] = new double[NMWRDBINS][NMWRDBINS];
// "w" from the previous ACE iteration, used for the convergence check.
static double wold[][] = new double[NMWRDBINS][NMWRDBINS];
// "w" of the solution with the largest separateness score.
static double wbest[][] = new double[NMWRDBINS][NMWRDBINS];
// Within (z) and between-plus-within (zplusb) sum of squares and
// cross-products matrices.
static double z[][] = new double[NMWRDBINS][NMWRDBINS];
static double zplusb[][] = new double[NMWRDBINS][NMWRDBINS];
// Scratch matrices (difference matrix, its transpose, and their product).
static double dummat[][] = new double[NMWRDBINS][NMWRDBINS];
static double dummatt[][] = new double[NMWRDBINS][NMWRDBINS];
static double chkmat[][] = new double[NMWRDBINS][NMWRDBINS];
// d[i][h] is 1 when observations i and h are closer than "u," else 0.
static double d[][] = new double[NMDOCS][NMDOCS];
// corpus[student][document][word bin]: transformed word frequencies.
static double corpus[][][] = new double[NMSTDNTS][NMDOCS][NMWRDBINS];
// Betweenness scores filled in by SNA.betweencent_.
static double betweencent[] = new double[NMDOCS];

// --------------------------------------------------------------------

public static void clstrsep_() {

/* Scores every student's set of documents (a "corpus") in the data
   file, "docfle," for cluster separateness and prints one result line
   per student. */

/* Read the student corpora file and build the multivariate word
   frequency observations; the call returns the student count. */

final int studentCount = createfreqdat_(docfle);

/* Look for author clusters in each student's corpus and report the
   measure of cluster separateness. */

int stdnt = 0;
while (stdnt < studentCount) {
   double sepScore = aceclus_(p, n[stdnt], corpus[stdnt]);
   printf_("clstrsep: stdnt= " + (stdnt + 1) + " sep-score= " + sepScore);
   ++stdnt;
}
iderr_("in clstrsep");
}

// --------------------------------------------------------------------

public static double aceclus_(int p, int n, double x[][]) {

/* Simultaneously forms clusters and estimates the number of clusters
   using the ACE algorithm of Art et al. (1982).

   p: number of variables (columns of "x") to use.
   n: number of observations (rows of "x") to use.
   x: observations, read as x[observation][variable].

   Returns the largest cluster separateness score found over the
   u-values tried, or 0 when no u-value produced an acceptable
   clustering.

   Side effects: overwrites the static work arrays xmean, w, wdecomp,
   wold, d, clstrmbrshp, nmclstr, z, zplusb, dummat, dummatt, chkmat,
   dumvec1, dumvec2 and ipvt, and leaves the best solution in
   mbrshpbest and wbest. */

boolean allzero = false;

int i, j, h, k, uloop, aceloop, nmusteps = 10, q, qbest = 0;

double u, dist = 0., nmrtr = 0., dnmntr = 0., smallval = .1, rn, rp,
   rq, distmin, distmax, maxsepscore = 0., lngth, maxeig, sepscore,
   cond, uincrmnt = 0., maxvar = 0.;

rn = (double) n;
rp = (double) p;

/* Compute the grand mean vector.  It depends only on "x," so this one
   computation serves every u-value below. */

for (i = 0; i < p; ++i) {
   xmean[0][i] = 0.;
   for (j = 0; j < n; ++j) {
      xmean[0][i] += x[j][i];
   }
   xmean[0][i] /= (double) n;
}

/* Initialize the within-cluster covariance matrix to be the
   identity matrix. */

Matrix.iden_(w, p);

// Decompose this initial within-cluster covariance matrix.

Matrix.mcpy_(wdecomp, w, p, p);
cond = Eqslv.decomp_(p, wdecomp, ipvt, dumvec1);
if (1.e10 < cond) {
   iderr_("aceclus: big condition number 0");
}

/* Find the smallest and largest distance that separate members in
   an observation pair. */

distmin = 1.e6;
distmax = 0.;
for (i = 1; i < n; ++i) {
   for (h = 0; h < i; ++h) {
      dist = Dkernl.dist_(p, p, x[i], x[h], false, msdinv, wdecomp, ipvt);
      if (dist < distmin) {
         distmin = dist;
      }
      if (distmax < dist) {
         distmax = dist;
      }
   }
}

/* u-value loop.  Only the lower half of the distance range is tried.
   Note that "distmin" and "uincrmnt" carry over from the ACE iterations
   of the previous u-value. */

maxsepscore = 0.;
uincrmnt = (distmax - distmin) / ((double) (nmusteps + 1));
ULOOP: for (uloop = 1; uloop <= nmusteps / 2; ++uloop) {
   u = distmin + ((double) uloop) * uincrmnt;
   printf_("aceclus: ------------- uloop= " + uloop + " u= " + u + " ----");

   // ACE iterations.

   for (aceloop = 0; aceloop < 10; ++aceloop) {

      /* Compute the distance between members of each observation pair
         under the current decomposed covariance matrix.  Then,
         determine if the pair is in the same cluster or not. */

      printf_("\naceclus: aceloop= " + aceloop + " u= " + u + " w=");

      distmin = 1.e6;
      distmax = 0.;
      for (i = 0; i < n; ++i) {
         clstrmbrshp[i] = 0;
         for (h = 0; h < i; ++h) {
            dist = Dkernl.dist_(p, p, x[i], x[h], false, msdinv, wdecomp,
                                ipvt);
            if (dist < u) {
               d[i][h] = 1.;

            } else {
               d[i][h] = 0.;
            }
            d[h][i] = d[i][h];

            // Update "distmin" and "distmax" values.

            if (dist < distmin) {
               distmin = dist;
            }
            if (distmax < dist) {
               distmax = dist;
            }
         }
      }

      /* Count the same-cluster pairs once; this count is the common
         denominator of every covariance entry.  With no such pair the
         covariance estimate is 0/0, which would fill "w" with NaN and
         carry over into every later u-value, so skip this u-value. */

      dnmntr = 0.;
      for (i = 1; i < n; ++i) {
         for (h = 0; h < i; ++h) {
            dnmntr += d[i][h];
         }
      }
      if (dnmntr == 0.) {
         printf_("aceclus: no same-cluster pairs for u= " + u);
         continue ULOOP;
      }

      // Update the within-cluster covariance matrix.

      for (j = 0; j < p; ++j) {
         for (k = 0; k <= j; ++k) {
            nmrtr = 0.;
            for (i = 1; i < n; ++i) {
               for (h = 0; h < i; ++h) {
                  nmrtr += d[i][h] * (x[i][j] - x[h][j]) *
                           (x[i][k] - x[h][k]);
               }
            }
            w[j][k] = nmrtr / (2. * dnmntr);
            w[k][j] = w[j][k];
         }
      }

      // Largest diagonal entry, used below if "w" is ill-conditioned.

      maxvar = 0.;
      for (j = 0; j < p; ++j) {
         if (maxvar < Math.abs(w[j][j])) {
            maxvar = Math.abs(w[j][j]);
         }
      }

      /* Decompose this updated within-cluster covariance matrix.  If
         the reported condition number is too large, overwrite the
         diagonal with 1.1 times the largest variance and try once
         more. */

      Matrix.mcpy_(wdecomp, w, p, p);
      if (1.e10 < Eqslv.decomp_(p, wdecomp, ipvt, dumvec1)) {
         for (i = 0; i < p; ++i) {
            w[i][i] = 1.1 * Math.abs(maxvar);
         }
         Matrix.mcpy_(wdecomp, w, p, p);
         if (1.e10 < Eqslv.decomp_(p, wdecomp, ipvt, dumvec1)) {
            iderr_("aceclus: second try failed");
         }
      }

      /* Check for convergence by computing the Spectral Norm between
         the current and old within-cluster covariance matrices. */

      if (aceloop > 0) {
         allzero = true;
         for (j = 0; j < p; ++j) {
            for (k = 0; k < p; ++k) {
               dummat[j][k] = w[j][k] - wold[j][k];
               if (Math.abs(dummat[j][k]) > 1.e-6) {
                  allzero = false;
               }
            }
         }
         if (allzero) {
            printf_("aceclus: allzero=true");
            break;
         }
         Matrix.trsp_(dummat, dummatt, p, p);
         Matrix.mxm_(dummatt, dummat, chkmat, p, p, p);

         /* Estimate the largest eigenvalue with the Power method,
            starting from the normalized vector of ones. */

         lngth = 0.;
         for (i = 0; i < p; ++i) {
            dumvec1[i] = 1.;
            lngth += dumvec1[i] * dumvec1[i];
         }
         lngth = Math.sqrt(lngth);
         for (i = 0; i < p; ++i) {
            dumvec1[i] /= lngth;
         }
         maxeig = Eigenvec.powerm_(chkmat, dumvec1, 1.e-4, p, 20);

         // Check for convergence.

         if (Math.sqrt(maxeig) < smallval) {
            printf_("aceclus: aceloop converged");
            break;
         }
      }

      /* After the first iteration the distances are on a new scale, so
         compute a new u-increment value and a new u-value. */

      if (aceloop == 0) {
         distmin = 1.e6;
         distmax = 0.;
         for (i = 0; i < n; ++i) {
            for (h = 0; h < i; ++h) {
               dist = Dkernl.dist_(p, p, x[i], x[h], false, msdinv,
                                   wdecomp, ipvt);
               if (dist < distmin) {
                  distmin = dist;
               }
               if (distmax < dist) {
                  distmax = dist;
               }
            }
         }
         uincrmnt = (distmax - distmin) / ((double) (nmusteps + 1));
         u = distmin + ((double) uloop) * uincrmnt;
      }

      // Store this current within-cluster covariance matrix.

      Matrix.mcpy_(wold, w, p, p);

   } // End of ACE loop.

   // Estimate cluster membership.

   q = estclstrs_(n, d, clstrmbrshp);
   if (q < 2 || q > n / 2) {
      printf_("aceclus: unacceptable q= " + q);
      continue;
   }

   /* Compute the within, and between-plus-within sum of squares
      and cross-products matrices. */

   rq = (double) q;
   for (i = 0; i < p; ++i) {
      for (j = 0; j < p; ++j) {
         z[i][j] = 0.;
         zplusb[i][j] = 0.;
      }
   }

   /* Cluster means.  The grand mean in xmean[0] is left untouched here:
      accumulating it inside this per-cluster loop would add every
      observation "q" times. */

   for (i = 1; i <= q; ++i) {
      for (j = 0; j < p; ++j) {
         xmean[i][j] = 0.;
      }
      nmclstr[i - 1] = 0;
      for (j = 0; j < n; ++j) {
         if (clstrmbrshp[j] == i) {
            for (k = 0; k < p; ++k) {
               xmean[i][k] += x[j][k];
            }
            ++nmclstr[i - 1];
         }
      }

      // A label with no members keeps a zero mean instead of 0/0.

      if (nmclstr[i - 1] > 0) {
         for (k = 0; k < p; ++k) {
            xmean[i][k] /= (double) nmclstr[i - 1];
         }
      }
   }

   // "dumvec1" updates z, "dumvec2" updates zplusb.

   for (i = 1; i <= q; ++i) {
      for (j = 0; j < n; ++j) {
         if (clstrmbrshp[j] == i) {
            for (k = 0; k < p; ++k) {
               dumvec1[k] = x[j][k] - xmean[i][k];
               dumvec2[k] = x[j][k] - xmean[0][k];
            }
            Matrix.vxvt_(dumvec1, dummat, p);
            Matrix.mupdt_(z, dummat, p, p);

            Matrix.vxvt_(dumvec2, dummat, p);
            Matrix.mupdt_(zplusb, dummat, p, p);
         }
      }
   }

   /* Compute cluster separateness score from the determinant ratio
      det(z) / det(zplusb); ratios at or below 1.e-6 score 0. */

   sepscore = Eigenvec.det_(zplusb, p);
   sepscore = Eigenvec.det_(z, p) / sepscore;
   if (1.e-6 < sepscore) {
      sepscore = -(rn - 1. - (rp + rq) / 2.) * Math.log(sepscore);

   } else {
      sepscore = 0.;
   }
   printf_("aceclus: q= " + q + " sepscore= " + sepscore);

   /* Store the membership and the "w" matrix of the solution having
      the largest sepscore value. */

   if (maxsepscore < sepscore) {
      maxsepscore = sepscore;
      qbest = q;
      for (i = 0; i < n; ++i) {
         mbrshpbest[i] = clstrmbrshp[i];
      }
      for (i = 0; i < p; ++i) {
         for (j = 0; j < p; ++j) {
            wbest[i][j] = w[i][j];
         }
      }
   }
} // End of u-value loop.

printf_("aceclus: qbest= " + qbest);
return (maxsepscore);
}

// ---------------------------------------------------------------------

public static int estclstrs_(int n, double d[][], int clstrmbrshp[]) {

/* Estimates cluster membership by removing links between observations
   that have high betweenness scores.

   n: number of observations.
   d: link matrix; d[i][j] == 1 marks a link between observations i
      and j.  It is modified while candidates are tried and restored
      before returning.
      NOTE(review): restoring assumes "d" is symmetric on entry, as
      aceclus_ builds it -- confirm for any other caller.
   clstrmbrshp: receives the cluster label of each observation for the
      best candidate found.

   For every combination of one link from each of the three
   highest-betweenness observations, the three links are removed,
   clusters are recomputed, and the combination giving the most
   clusters is kept.  Returns that cluster count, or 0 when no
   combination could be formed (one of the three observations has no
   links); in that case "clstrmbrshp" is left as it was.

   NOTE(review): index[1] and index[2] are read unconditionally, so this
   appears to assume n >= 3 -- confirm. */

int i, j, k, l, betobs1, betobs2, betobs3, nmchk1 = 0, nmchk2 = 0,
   nmchk3 = 0, q = 0, qbest = 0;

/* Find the three observations with the largest betweenness centrality
   scores. */

SNA.betweencent_(n, d, betweencent);

/* Sort observations in descending order; the scores are negated before
   the sort.  "index" holds 1-based observation numbers. */

for (i = 0; i < n; ++i) {
   index[i] = i + 1;
   betweencent[i] *= -1.;
}
Idsort.idsort_(betweencent, index, 1, n);
betobs1 = index[0];
betobs2 = index[1];
betobs3 = index[2];

/* Form arrays of all the links associated with each of these three
   observations.  Links are stored as 1-based (row, column) pairs with
   row < column. */

for (i = 1; i < n; ++i) {
   for (j = i + 1; j <= n; ++j) {
      if (d[i - 1][j - 1] == 1. && (i == betobs1 || j == betobs1)) {
         chkarray1[nmchk1][0] = i;
         chkarray1[nmchk1][1] = j;
         ++nmchk1;
      }
      if (d[i - 1][j - 1] == 1. && (i == betobs2 || j == betobs2)) {
         chkarray2[nmchk2][0] = i;
         chkarray2[nmchk2][1] = j;
         ++nmchk2;
      }
      if (d[i - 1][j - 1] == 1. && (i == betobs3 || j == betobs3)) {
         chkarray3[nmchk3][0] = i;
         chkarray3[nmchk3][1] = j;
         ++nmchk3;
      }
   }
}

/* Try every combination of one link per observation.  All three links
   are removed just before clustering and put back right after, so a
   link shared by two lists cannot be restored too early. */

for (i = 0; i < nmchk1; ++i) {
   for (j = 0; j < nmchk2; ++j) {
      for (k = 0; k < nmchk3; ++k) {
         setlink_(d, chkarray1[i], 0.);
         setlink_(d, chkarray2[j], 0.);
         setlink_(d, chkarray3[k], 0.);

         // Clear labels left over from the previous candidate.

         for (l = 0; l < n; ++l) {
            clstrmbrshp[l] = 0;
         }
         q = clstrmbrs_(n, d, clstrmbrshp);
         if (qbest < q) {
            qbest = q;
            for (l = 0; l < n; ++l) {
               clstrbest[l] = clstrmbrshp[l];
            }
         }

         setlink_(d, chkarray1[i], 1.);
         setlink_(d, chkarray2[j], 1.);
         setlink_(d, chkarray3[k], 1.);
      }
   }
}

/* Load this best cluster membership array and return.  Skipped when
   no candidate was evaluated, because "clstrbest" would then hold
   values from an earlier call. */

if (qbest > 0) {
   for (i = 0; i < n; ++i) {
      clstrmbrshp[i] = clstrbest[i];
   }
}
return qbest;
}

// Sets both symmetric entries of "d" for a 1-based (row, column) link.

private static void setlink_(double d[][], int link[], double val) {
d[link[0] - 1][link[1] - 1] = val;
d[link[1] - 1][link[0] - 1] = val;
}

// ---------------------------------------------------------------------

public static int clstrmbrs_(int n, double d[][], int clstrmbrshp[]) {

/* Discovers cluster membership and the number of clusters, q.

   n: number of observations.
   d: link matrix.  An observation whose whole row holds no positive
      entry is a singleton; otherwise observations j < k are linked
      when the lower-triangle entry d[k][j] equals 1.
   clstrmbrshp: output; entry i receives the label (1..q) of the
      cluster holding observation i.  Any previous content is
      discarded.

   Returns q, the number of clusters formed.  Labels run from 1 to q
   with none skipped. */

boolean singleton = false, grew = false;

int i, j, k, q = 0, clstrfound = 0;

// Discard labels from any earlier call; 0 means "unassigned."

for (i = 0; i < n; ++i) {
   clstrmbrshp[i] = 0;
}

// First, form singleton clusters, if any.

for (i = 0; i < n; ++i) {
   singleton = true;
   for (j = 0; j < n; ++j) {
      if (d[i][j] > 0.) {
         singleton = false;
         break;
      }
   }
   if (singleton) {
      ++q;
      clstrmbrshp[i] = q;
      ++clstrfound;
   }
}

// Now find clusters of size greater than 1.

while (clstrfound < n) {

   /* Seed cluster "q" with the first observation not assigned to a
      cluster; one exists because clstrfound < n. */

   for (i = 0; i < n; ++i) {
      if (clstrmbrshp[i] == 0) {
         ++q;
         clstrmbrshp[i] = q;
         ++clstrfound;
         break;
      }
   }

   /* Find all members of cluster "q."  Sweep the lower triangle until
      a full pass adds nobody, so that members reachable only through a
      higher-numbered observation are picked up as well. */

   do {
      grew = false;
      for (j = 0; j < n - 1; ++j) {
         for (k = j + 1; k < n; ++k) {
            if (d[k][j] == 1.) {
               if (clstrmbrshp[j] == q && clstrmbrshp[k] == 0) {
                  clstrmbrshp[k] = q;
                  ++clstrfound;
                  grew = true;

               } else if (clstrmbrshp[k] == q && clstrmbrshp[j] == 0) {
                  clstrmbrshp[j] = q;
                  ++clstrfound;
                  grew = true;
               }
            }
         }
      }
   } while (grew);
}

/* Every label is given to a previously unassigned observation, so q
   cannot exceed n here. */

return q;
}

// ---------------------------------------------------------------------

public static int createfreqdat_(String flename) {

/* Builds the word-frequency observations for every document in the
   file "flename."  The "p" most frequent words over all students and
   documents become the common word bins; each document is then turned
   into a vector of its frequencies for those words, transformed with
   Pltnrm.icump_ and stored in "corpus."  Returns the number of
   students read. */

int bin, stdnt, doc, wrd, studentCount = 0, totalWords = 0;

double relFreq;

fleopen_(2, flename, 'r');

// Pass 1: read the entire corpora, keeping each document's words.

bin = 0;
while (bin < NMWRDBINS) {
   Textutils.wordfreq[bin] = 0.;
   ++bin;
}
Storyutils.createnew = true;

while (!checkeof_(2)) {
   n[studentCount] = 0;
   do {
      Storyutils.readstory_(2, n[studentCount]);
      doc = n[studentCount];
      nmdocwords[studentCount][doc] = Storyutils.nmwords;
      totalWords += Storyutils.nmwords;
      wrd = 0;
      while (wrd < nmdocwords[studentCount][doc]) {
         docword[studentCount][doc][wrd] = Storyutils.wordvec[wrd];
         ++wrd;
      }
      n[studentCount] = doc + 1;
   } while (!Storyutils.endstudent);
   ++studentCount;
}
fclose_(2, 'r');

/* Sort word frequencies and word bins in descending order; the
   frequencies are negated before the sort. */

bin = 0;
while (bin < Textutils.nmwordbins) {
   index[bin] = bin + 1;
   Textutils.wordfreq[bin] = Textutils.wordfreq[bin] * -1.;
   ++bin;
}
Idsort.idsort_(Textutils.wordfreq, index, 1, Textutils.nmwordbins);

/* Create the new word bin vector.  The words are gathered in a scratch
   vector first because "binword" is still being read through "index." */

for (bin = 0; bin < p; ++bin) {
   dumstrngvec[bin] = Textutils.binword[index[bin] - 1];
   Textutils.wordfreq[bin] = Textutils.wordfreq[bin] /
      -((double) totalWords);
}
for (bin = 0; bin < p; ++bin) {
   Textutils.binword[bin] = dumstrngvec[bin];
   printf_("createfreqdat: word= " + Textutils.binword[bin] + " freq= " +
      Textutils.wordfreq[bin]);
}

/* Pass 2: for each document written by each student, replay its stored
   words against the class-wide common set of bin words and create its
   word-frequency observation vector. */

Textutils.nmwordbins = p;
for (stdnt = 0; stdnt < studentCount; ++stdnt) {
   for (doc = 0; doc < n[stdnt]; ++doc) {
      for (bin = 0; bin < p; ++bin) {
         Textutils.wordfreq[bin] = 0.;
      }
      for (wrd = 0; wrd < nmdocwords[stdnt][doc]; ++wrd) {
         Textutils.wordfreqs_(docword[stdnt][doc][wrd], false);
      }
      for (bin = 0; bin < p; ++bin) {

         // Frequencies below 1.e-6 are replaced by .00001.

         relFreq = Textutils.wordfreq[bin] / (double) nmdocwords[stdnt][doc];
         corpus[stdnt][doc][bin] =
            Pltnrm.icump_(relFreq < 1.e-6 ? .00001 : relFreq);
      }
   }
}
return studentCount;
}
}
