Additional material
by Mathieu Basille.
A set of functions to split year-round space-use measurements into biological seasons, completed with additional functions to explore and simplify these seasons. Reference: Basille M., Fortin D., Dussault C., Ouellet J.-P., Courtois R. Ecologically based definition of seasons clarifies predator-prey interactions. Ecography, in press.
The 'Gap' function correctly compute the gap statistic (weighted or not, weighted by default). The function works on a matrix of data and returns a data frame with different components:
- nCluster: k, the number of the cluster;
- logWk0: log(W_k) from the data, weighted or not;
- logWk: from the simulated data sets;
- Gap: the gap statistic;
- sdGap: the standard deviation of the gap statistic;
- k: the estimated number of clusters with the classical approach;
- D: differences of gap;
- DD: differences of Dgap;
- DDk: the estimated number of clusters with the DD-weighted method; the number of clusters $k$ is given when $DD$Gap is maximum.
Gap <- function (data, from = 1, to = 10, nsim = 50, ref.dist = "pc",
clust.method = "k-means", dist.method = "euclidean", weighted = TRUE,
tol = 1)
{
clust <- function(mat, nclust, clust.method) {
set.seed(1)
if (nclust == 1)
return(rep(1, nrow(mat)))
else if (clust.method == "pam")
return(pam(mat, nclust)$cluster)
else if (clust.method == "k-means")
return(kmeans(mat, nclust, 100)$cluster)
else if (clust.method == "diana")
return(cutree(as.hclust(diana(dist(mat))), k = nclust))
else if (clust.method %in% c("single", "complete", "average",
"ward", "mcquitty", "median", "centroid"))
return(cutree(hclust(dist(mat), method = method),
nclust))
else stop("Wrong clustering method")
}
computeWk <- function(mat, cl, dist.method = "euclidean",
weighted = TRUE) {
Wk <- 0
for (cli in unique(cl)) {
if (!weighted) {
if (dist.method == "euclidean")
Wk <- Wk + sum(diag(var(mat[cl == cli, ]))) *
(length(cl[cl == cli]) - 1)
else {
D <- sum(dist(mat[cl == cli, ], method = dist.method)^2)
Wk <- Wk + D/(nrow(mat[cl == cli, , drop = FALSE]))
}
}
else {
if (dist.method == "euclidean")
Wk <- Wk + sum(diag(var(mat[cl == cli, ])))
else {
D <- sum(dist(mat[cl == cli, ], method = dist.method)^2)
Wk <- Wk + D/(nrow(mat[cl == cli, , drop = FALSE]) *
(nrow(mat[cl == cli, , drop = FALSE]) - 1))
}
}
}
return(Wk)
}
simX <- function(mat, seed = seed, ref.dist = ref.dist) {
set.seed(seed)
simunif <- function(x) return(apply(x, 2, function(w) runif(length(w),
min = min(w), max = max(w))))
simpc <- function(x) {
x <- as.matrix(x)
xmm <- apply(x, 2, mean)
for (k in (1:dim(x)[2])) x[, k] <- x[, k] - xmm[k]
ss <- svd(x)
xsim <- simunif(x %*% ss$v) %*% t(ss$v)
for (k in (1:dim(x)[2])) xsim[, k] <- xsim[, k] +
xmm[k]
return(xsim)
}
if (ref.dist == "unif")
return(simunif(mat))
if (ref.dist == "pc")
return(simpc(mat))
}
Gap <- data.frame(matrix(NA, nrow = to - from + 1, ncol = 9))
names(Gap) <- c("nCluster", "logWk0", "logWk", "Gap", "sdGap",
"k", "D", "DD", "DDk")
for (i in from:to) {
Gap[i, "nCluster"] <- i
Gap[i, "logWk0"] <- log(computeWk(data, clust(data, i,
clust.method = clust.method), dist.method = dist.method,
weighted = weighted))
Sim <- lapply(1:nsim, function(seed) simX(data, seed = seed,
ref.dist = ref.dist))
logWksim <- log(unlist(lapply(Sim, function(xl) computeWk(xl,
clust(xl, i, clust.method = clust.method), dist.method = dist.method,
weighted = weighted))))
Gap[i, "logWk"] <- mean(logWksim)
Gap[i, "Gap"] <- Gap[i, "logWk"] - Gap[i, "logWk0"]
Gap[i, "sdGap"] <- sqrt(1 + 1/nsim) * sqrt(var(logWksim) *
(nsim - 1)/nsim)
}
Gap$k <- ""
Gap$k[min(which(Gap$Gap[-nrow(Gap)] >= c(Gap$Gap - (tol *
Gap$sdGap))[-1]))] <- "*"
Gap$D[2:nrow(Gap)] <- diff(Gap$Gap)
Gap$DD[2:(nrow(Gap) - 1)] <- Gap$D[2:(nrow(Gap) - 1)] - Gap$D[3:nrow(Gap)]
Gap$DDk <- ""
Gap$DDk[which(Gap$DD == max(Gap$DD, na.rm = TRUE))] <- "*"
return(Gap)
}
The function 'GapPlot' plots W_k, the gap statistic, DGap and DDGap of a given clustering, with the estimated number of clusters for each method.
GapPlot <- function (x)
{
old.par <- par(no.readonly = TRUE)
on.exit(par(old.par))
par(mfrow = c(2, 2))
k <- seq(1:nrow(x))
plot(k, exp(x$logWk0), xlab = "Number of clusters k", ylab = "Wk",
type = "b")
plot(k, x$Gap, xlab = "Number of clusters k", ylab = "GAP",
type = "b")
segments(k, c(x$Gap - x$sdGap), k, c(x$Gap + x$sdGap))
points(x$nCluster[x$k == "*"], x$Gap[x$k == "*"], col = "black",
pch = 19, cex = 1.5)
plot(k, x$D, xlab = "Number of clusters k", ylab = "D", type = "b")
plot(k, x$DD, xlab = "Number of clusters k", ylab = "DD",
type = "b")
points(x$nCluster[x$DDk == "*"], x$DD[x$DDk == "*"], col = "black",
pch = 19, cex = 1.5)
}
In addition, given a clustering, the function 'Seasons' simplify it to remove the smallest seasons, by checking for each day the values of the next 'win' days: if they have the same value (with a tolerance of 'tol' days), the day is kept as is; otherwise, the given day takes the value of the day before. 'SPlot' and 'Sformat' produces a plot and a human-readable output of the clustering, while 'SBoxplot' displays boxplots for every season (in their order by default).
Seasons <- function (clust, win = 3, tol = 1)
{
clust <- c(clust[length(clust)], clust, clust[1:win])
for (i in 2:(length(clust) - win)) if (sum(clust[(i + 1):(i +
win)] == clust[i]) < (win - tol))
clust[i] <- clust[i - 1]
for (i in 2:(length(clust) - win)) if (sum(clust[(i + 1):(i +
win)] == clust[i]) < (win - tol))
clust[i] <- clust[i - 1]
return(clust[-c(1, (length(clust) - win + 1):length(clust))])
}
SPlot <- function (seasons)
{
old.par <- par(no.readonly = TRUE)
on.exit(par(old.par))
par(mfrow = c(1, 1), mar = c(4, 3, 4, 2) + 0.1)
plot(seasons, pch = 15, cex = 0.5, axes = FALSE, main = "Seasons",
xlab = "")
abline(v = as.numeric(names(which(diff(seasons) != 0))),
lty = 3, lwd = 0.5)
axis(1, at = as.numeric(names(which(diff(seasons) != 0))),
label = format(strptime(names(which(diff(seasons) !=
0)), format = "%j"), "%d/%m"), cex.axis = 0.8, las = 2)
axis(2)
box()
}
SFormat <- function (seasons, ndays = FALSE, nseasons = FALSE, trunk = FALSE)
{
br <- names(which(diff(seasons) != 0))
if (trunk) {
br <- c(names(seasons)[1], br, names(seasons)[length(seasons)])
brn <- c("start", seasons[br[-c(1, length(br))]], "end")
}
else brn <- seasons[br]
Sbr <- format(strptime(br, format = "%j"), "%d/%m")
names(Sbr) <- brn
if (ndays)
return(as.numeric(br))
if (nseasons)
return(seasons[br])
else return(Sbr)
}
SBoxplot <- function (data, seasons, order = TRUE, months = c("rectangles",
"lines"), cluster = TRUE, multi = FALSE, samescale = TRUE)
{
old.par <- par(no.readonly = TRUE)
if (multi)
par(mfcol = c(ncol(data[[1]]), length(data)), mar = c(1,
2, 3, 0) + 0.1)
else {
par(mfrow = n2mfrow(ncol(data)), mar = c(1, 2, 3, 0) +
0.1)
data <- list(data)
seasons <- list(seasons)
}
on.exit(par(old.par))
months <- match.arg(months)
if (order) {
for (j in 1:length(data)) {
datarb <- do.call(rbind, data)
datatmp <- data[[j]]
seasonstmp <- seasons[[j]]
changes <- as.numeric(c(names(seasonstmp)[1], names(which(diff(seasonstmp) !=
0)), names(seasonstmp)[length(seasonstmp)]))
seas <- seasonstmp[as.character(changes[-length(changes)])]
at <- changes[-length(changes)] + diff(changes)/2
for (i in 1:ncol(datatmp)) {
summ <- do.call(rbind, lapply(1:max(seasonstmp),
function(j) summary(datatmp[seasonstmp == j,
i])))
if (samescale) {
plot(1:365, datatmp[, i], type = "n", xlim = c(1,
365), ylim = range(datarb[, i]), axes = FALSE,
main = names(datatmp)[i])
par(usr = c(1, 365, min(datarb[, i]), max(datarb[,
i])))
}
else {
plot(1:365, datatmp[, i], type = "n", xlim = c(1,
365), ylim = range(datatmp[, i]), axes = FALSE,
main = names(datatmp)[i])
par(usr = c(1, 365, min(datatmp[, i]), max(datatmp[,
i])))
}
if (months == "lines")
abline(v = c(32, 60, 91, 121, 152, 182, 213,
244, 274, 305, 335), lty = 3, col = grey(0.6))
if (months == "rectangles")
rect(c(32, 91, 152, 213, 274, 335), min(datarb[,
i]), c(60, 121, 182, 244, 305, 365), max(datarb[,
i]), col = grey(0.9), border = NA)
segments(at, summ[seas, "Min."], y1 = summ[seas,
"Max."], lty = 2)
for (j in 1:length(seas)) polygon(x = changes[c(j,
j + 1, j + 1, j)] + c(1, -1, -1, 1), y = rep(summ[seas[j],
c("1st Qu.", "3rd Qu.")], each = 2), col = "white")
segments(changes[-length(changes)] + 1, summ[seas,
"Median"], changes[-1] - 1, lwd = 3)
axis(2)
axis(1, at = c(16, 46, 75.5, 106, 136.5, 167,
197.5, 228.5, 259, 289.5, 320, 350), labels = c("J",
"F", "M", "A", "M", "J", "J", "A", "S", "O",
"N", "D"), tick = FALSE, line = -1, cex.axis = 0.8)
if (cluster)
axis(3, at = at, labels = seas, tick = FALSE,
line = -1, cex.axis = 0.8)
box()
}
}
}
else {
for (j in 1:length(data)) {
datatmp <- data[[j]]
seasonstmp <- seasons[[j]]
for (i in 1:ncol(data)) {
boxplot(data[, i] ~ seasons, axes = FALSE, main = names(data)[i],
ylim = c(0, 1))
axis(1)
box()
}
}
}
}