public class Deduplicate extends SNA {
#
# Read the analysis file "qanalysis.dat" and report likely duplicate records.
#
# File layout expected by this script:
#   line 1             number of records (nmrecs)
#   line 2             column header for the record lines (skipped)
#   next nmrecs lines  id code, two numeric attributes, two contact fields
#   next line          column header for the similarity lines (skipped)
#   remaining lines    one line per record pair (i < j): two id codes
#                      followed by three similarity values
# Fields on every data line are separated by one or more blanks.
#
# Read id codes and associated non-secure attributes.
con <- file(description="qanalysis.dat", open="r")
nmrecs <- scan(file=con, nlines=1, quiet=TRUE)
if (length(nmrecs) != 1) {
   close(con)
   stop("qanalysis.dat does not start with a single record count")
}
idcode <- paste("none",seq_len(nmrecs),sep="")
nattbts <- matrix(0,nmrecs,2)
cattbts <- matrix("     ",nmrecs,4)
simval <- array(0,c(nmrecs,nmrecs,3))
# Skip the record header line.
tmp <- readLines(con, n=1)
for (i in seq_len(nmrecs)) {
   tmpraw <- readLines(con, n=1)
   # readLines() returns the whole line as a single string, so it has to be
   # split into its blank-separated fields before the fields are indexed.
   tmp <- unlist(strsplit(trimws(tmpraw)," +"))
   idcode[i] <- tmp[1]
   nattbts[i,1] <- as.numeric(tmp[2])
   nattbts[i,2] <- as.numeric(tmp[3])
   # Contact fields are stored as read; a missing contact appears as the
   # text "NA" in the file.
   cattbts[i,1] <- tmp[4]
   cattbts[i,2] <- tmp[5]
}
# Read player-pair similarity measure values.
# Skip the similarity header line.
tmp <- readLines(con, n=1)
# seq_len() yields an empty loop when there are fewer than two records,
# whereas 1:(nmrecs - 1) would count downwards and index out of bounds.
for (i in seq_len(max(nmrecs - 1, 0))) {
   for (j in (i + 1):nmrecs) {
      tmpraw <- readLines(con, n=1)
      tmp <- unlist(strsplit(trimws(tmpraw)," +"))
      # Fields 1 and 2 are the two id codes; fields 3-5 are the measures.
      simval[i,j,1] <- as.numeric(tmp[3])
      simval[i,j,2] <- as.numeric(tmp[4])
      simval[i,j,3] <- as.numeric(tmp[5])
   }
}
close(con)
# Set the threshold value for declaring a duplicate record.
thrshld <- .9
# Compute the total similarity score and print out those player-pairs that
# are judged to be duplicates.
nmsims <- 3
for (i in seq_len(max(nmrecs - 1, 0))) {
   for (j in (i + 1):nmrecs) {
      # The total score is the unweighted mean of the similarity measures.
      ts <- 0
      for (k in seq_len(nmsims)) {
         ts <- ts + simval[i,j,k]
      }
      ts <- ts / nmsims
      cat("i= ",i," j= ",j," ts= ",ts,"\n")
      # A missing similarity value makes ts NA/NaN; such a pair is reported
      # above but is not judged a duplicate (and no longer aborts the run).
      if (!is.na(ts) && ts > thrshld) {
         cat(idcode[i]," and ",idcode[j]," are judged to be duplicates.\n")
      }
   }
}

# ---------------------------------------------------------------------

# Define Jaccard similarity measure.
#
# Treats each argument as a set of values: missing entries (NA) are ignored
# and repeated entries count once. The result is
#    (number of distinct values present in both sets) /
#    (number of distinct values present in either set).
#
# Arguments:
#    charvec1, charvec2  vectors of values to compare (any length).
# Value:
#    A number between 0 and 1. When neither vector holds a non-missing
#    value the ratio is undefined; 0 is returned in that case so the result
#    is always a usable number.
jaccardsim <- function(charvec1,charvec2) {
   # Reduce both inputs to sets so a repeated element cannot be counted
   # more than once in either the intersection or the union.
   set1 <- unique(charvec1[!is.na(charvec1)])
   set2 <- unique(charvec2[!is.na(charvec2)])
   nmunion <- length(union(set1,set2))
   if (nmunion == 0) {
      return(0)
   }
   length(intersect(set1,set2)) / nmunion
}
#
# Prepare the analysis file: read the raw query result from "qresult.dat",
# assign an id code to every record, and write the non-secure attributes and
# the pairwise similarity measures to "qanalysis.dat".
qrslt <- read.table("qresult.dat",header=TRUE, na.strings="NA")
#print(head(qrslt))
nmrecs <- nrow(qrslt)
#cat("nmrecs= ",nmrecs,"\n")
# Character attributes: name, town and the two contact names.
cattbts <- matrix("     ",nmrecs,4)
cattbts[,1] <- as.vector(qrslt$name)
cattbts[,2] <- as.vector(qrslt$town)
cattbts[,3] <- as.vector(qrslt$contact1)
cattbts[,4] <- as.vector(qrslt$contact2)
#print(cattbts)
# Numeric attributes.
nattbts <- matrix(0,nmrecs,2)
nattbts[,1] <- as.vector(qrslt$nmguns)
nattbts[,2] <- as.vector(qrslt$nmvehicles)
# Create identification codes.
idcode <- paste("h",seq_len(nmrecs),sep="")
# Open the output file once; every cat() below writes to this connection
# instead of reopening the file in append mode for each line.
outcon <- file("qanalysis.dat", open="w")
cat(nmrecs,"\n",file=outcon)
# Write id code and non-secure attributes.
cat("idcode nmguns nmvehicles contact1 contact2","\n",file=outcon)
for (i in seq_len(nmrecs)) {
   # Replace each contact name by the id code of the record carrying that
   # name. which() drops NA comparisons, so a missing contact or a missing
   # name simply yields no match. When several records share the name, the
   # last one is used.
   contact1 <- NA
   hits <- which(cattbts[,1] == cattbts[i,3])
   if (length(hits) > 0) contact1 <- idcode[max(hits)]
   contact2 <- NA
   hits <- which(cattbts[,1] == cattbts[i,4])
   if (length(hits) > 0) contact2 <- idcode[max(hits)]
   cat(idcode[i]," ",nattbts[i,1]," ",nattbts[i,2]," ",contact1," ",
       contact2,"\n",
      file=outcon)
}
# Compute and write similarity measures.
# The function "adist(str1,str2)" returns the "generalized Levenshtein
# (edit) distance" between str1 and str2.
cat("idcodei idcodej namesim townsim contactssim","\n",
   file=outcon)
# seq_len() yields an empty loop when there are fewer than two records,
# whereas 1:(nmrecs - 1) would count downwards and index out of bounds.
for (i in seq_len(max(nmrecs - 1, 0))) {
   for (j in (i + 1):nmrecs) {
      # Name and town similarity: one minus the edit distance scaled by the
      # length of the longer string.
      str1 <- cattbts[i,1]
      str2 <- cattbts[j,1]
      namesim <- 1. - (adist(str1,str2) /
                       max(nchar(str1),nchar(str2)))
      str1 <- cattbts[i,2]
      str2 <- cattbts[j,2]
      townsim <- 1. - (adist(str1,str2) /
                       max(nchar(str1),nchar(str2)))
      # Contact similarity compares the two contact names of each record.
      cvec1 <- c(cattbts[i,3],cattbts[i,4])
      cvec2 <- c(cattbts[j,3],cattbts[j,4])
      contactssim <- jaccardsim(cvec1, cvec2)
      cat(idcode[i]," ",idcode[j]," ",namesim," ",townsim," ",
         contactssim,"\n",
         file=outcon)
   }
}
close(outcon)
}
