###Make sure the library(clusterSim) with all its dependencies is properly installed ###Make sure R is using as its working folder the folder with the files: ### "gap.statistic.r", and "data.csv" or your own data. setwd("E://Work/KMeans/supplement") source("gap.statistic.r") ##this function loads the function "index.Gap.modif" data <- read.csv("data.csv", header=T, sep=",") ##load data x <- data[,c("ACT1", "ACT2", "SL_METERS", "TURN_DEGRE")] ###take the absolute value from the turning angle, as the direction of turn is not of interest here x$TA <- abs(x$TURN_DEGRE) x$TURN_DEGRE <- NULL ###visual inspection of the raw data par(mfrow=c(2,2)) for (i in 1:ncol(x)){ hist(x[,i]) } ###log-transformation of all variables except TA for (i in 1:3){x[,i] <- log(x[,i]+1)} ###range standardization of all variables resc <- function(x){(x-min(x))/(max(x)-min(x))} for (i in 1:ncol(x)){x[,i] <- resc(x[,i])} ###visual inspection of the transformed data par(mfrow=c(2,2)) for (i in 1:ncol(x)){ hist(x[,i]) } x <- x[,c(1, 2, 3, 4)] ##use all variables #x <- x[,c(3, 4)] ##use only trajectory measures #x <- x[,c(1, 2)] ##use only activity measures ###To allow you to obtain exactly the same results as in the paper: ##it is recommanded to vary this value, to check the effect of the inital random values set.seed(1) res <- data.frame(GAP=NA, s=NA, Wo=NA, We=NA) for (i in 1:10){ ##determine the GAP-statistic for 1 to 10 clusters if (i==1) { ##clall is the vector (in matrix format) of integers indicating the group to which each datum is assigned ones <- rep(1, nrow(x)) clall<-matrix(ones)} if (i>1){ cl1<-kmeans(x,i, iter.max=100) clall<-matrix(cl1$cluster)} g<-index.Gap.modif(x, clall, reference.distribution="pc", B=50, method="k-means") res[i,] <- c(g$gap, g$s, g$Wo, g$We) } ###plot the GAP statistic for each number of clusters together with its standard error par(mfrow=c(1,1)) k <- seq(1:length(res$GAP)) plot(k, res$GAP, xlab="number of clusters k", ylab="GAP", main="GAP-statistic", type="b") segments(k, c(res$GAP-res$s), k, c(res$GAP+res$s)) kstar <- min(which(res$GAP[-length(res$GAP)] >= c(res$GAP-res$s)[-1])) kstar2 <- min(which(res$GAP[-length(res$GAP)] >= c(res$GAP-(1.96*res$s))[-1])) points(kstar2, res$GAP[kstar2], pch=22, bg="gray", cex=1.25) points(kstar, res$GAP[kstar], col="black", pch=19, cex=0.6)