Modeling and outlier detection
In this section we present the scripts used to create a model and then to use this model to find outliers.
To learn a model from a cluster of apps, we use the following R script. It produces the average frequency vector and the cut-off/threshold. It expects, in this order: the list of apps in the given cluster, the list of unique 'exposed' APIs in the cluster, the output filename for the average frequency vector, the output filename for the threshold/cut-off value, and the directory containing the static analysis result of each app in the cluster.
# Learn the model of one cluster of apps.
#
# Builds a binary app-by-API usage matrix from the per-app static analysis
# results, then writes
#   * the average usage of every API over the cluster, and
#   * the cut-off: the upper boxplot whisker of the Euclidean distances
#     between each app's usage vector and that average.
#
# Arguments (all paths are relative to the current working directory):
#   1. file listing the apps of the cluster, one per line
#   2. file listing the APIs of the cluster, one per line
#   3. output file for the average frequency vector
#   4. output file for the cut-off value
#   5. directory holding one static analysis result file per app
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 5) {
  stop("Usage: apps_list.csv apis_in_cluster.csv ave_out_filename cutoff_filename cluster_files",
       call. = FALSE)
}

wd <- getwd()

# Both inputs are mandatory, so a read failure is left to abort the script
# with the reader's own message instead of being swallowed.
apps <- read.table(file.path(wd, args[1]), header = FALSE, sep = "\n")
apis <- read.csv(file.path(wd, args[2]), header = FALSE, sep = "\n")
api_names <- as.character(apis[[1]])
n_apps <- nrow(apps)

# Full paths make changing the working directory unnecessary.
files <- list.files(path = file.path(wd, args[5]), full.names = TRUE)
if (length(files) > n_apps) {
  stop("Found ", length(files), " analysis files but only ", n_apps,
       " apps are listed in ", args[1], call. = FALSE)
}
if (length(files) < n_apps) {
  warning("Found ", length(files), " analysis files for ", n_apps,
          " listed apps; apps without a file count as using no API",
          call. = FALSE)
}

# Row i describes the i-th analysis file, column j the j-th cluster API.
usage <- matrix(0, nrow = n_apps, ncol = length(api_names))
for (i in seq_along(files)) {
  # Best effort per file: a file that cannot be parsed (read.table raises an
  # error for an empty file, for instance) leaves its row all zero.
  d1 <- tryCatch(
    read.table(files[i], header = FALSE, quote = "\"", sep = "\n"),
    error = function(e) NULL
  )
  if (is.null(d1)) {
    next
  }
  # Compare against the API names only; APIs that are not part of the
  # cluster list are ignored.
  usage[i, api_names %in% as.character(d1[[1]])] <- 1
}

# Fraction of apps that use each API.
average <- colMeans(usage)

# Euclidean distance of every app to the average vector.
distances <- sqrt(rowSums(sweep(usage, 2, average)^2))

# Upper whisker of the boxplot, computed without drawing a plot.
cutoff <- boxplot.stats(distances)$stats[5]

write.csv(average, file.path(wd, args[3]), row.names = FALSE)
write.csv(cutoff, file.path(wd, args[4]), row.names = FALSE)
With this, we have a model of each cluster that we are interested in.
Once we created a model, assuming we already performed the cluster assignment (using Weka), we perform outlier detection. We use the following R script to do outlier detection.
The script expects six arguments, in this order: an identifier of the given app, the list of APIs in the given cluster, the average frequency vector for the cluster, the threshold for the cluster, the list of APIs used by the given app, and the output file.
# Detect anomalous API usage of a single app against a cluster model.
#
# The app is reported as an outlier when its distance to the cluster average
# reaches the cut-off, or when it uses an API unknown to the cluster. For an
# outlier, every used API that is unknown to the cluster or whose average
# frequency is at most the median frequency is appended to the output file.
#
# Arguments (all paths are relative to the current working directory):
#   1. app identifier (kept for interface compatibility; not used below)
#   2. file listing the APIs of the cluster, one per line
#   3. average frequency vector of the cluster (read with a header line)
#   4. cut-off of the cluster (read with a header line)
#   5. file listing the APIs used by the app, one per line
#   6. output file; anomalous APIs are appended to it, one per line
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 6) {
  stop("Usage: app_name apis_in_cluster.csv average.txt cutoff.txt apps_api_usage.txt output_file",
       call. = FALSE)
}

wd <- getwd()
cutoff <- read.table(file.path(wd, args[4]), header = TRUE, quote = "\"", sep = "\n")
average <- read.table(file.path(wd, args[3]), header = TRUE, quote = "\"", sep = "\n")
apis <- read.table(file.path(wd, args[2]), header = FALSE, sep = "\n")
d1 <- read.table(file.path(wd, args[5]), header = FALSE, sep = "\n")
output_file <- file.path(wd, args[6])

# Work on plain vectors rather than one-column data frames.
cutoff_value <- as.double(cutoff[1, 1])
avg <- as.numeric(average[[1]])
api_names <- as.character(apis[[1]])
used <- as.character(d1[[1]])

if (length(avg) != length(api_names)) {
  stop("The average vector has ", length(avg), " entries but the cluster lists ",
       length(api_names), " APIs", call. = FALSE)
}

# Position of every used API in the cluster list; NA marks an API that the
# cluster has never seen.
ids <- match(used, api_names)
has_new_api <- anyNA(ids)

# Binary usage vector of the app over the cluster's APIs.
present <- as.numeric(api_names %in% used)
distance <- sqrt(sum((present - avg)^2))
print(paste("Distance:", distance, "---cutoff:", cutoff_value, sep = " "))

if (distance >= cutoff_value || has_new_api) {
  med <- median(avg)
  # which() drops comparisons that evaluate to NA.
  flagged <- used[which(is.na(ids) | avg[ids] <= med)]
  if (length(flagged) > 0) {
    write(flagged, output_file, append = TRUE)
  }
}
This script produces the list of anomalous APIs in the app. The following tools are provided as JAR files.