giovedì 10 gennaio 2008

Hello World for Clustering methods

A hello world program can be a useful sanity test to make sure that the procedure/methods you are analyzing "works" at least for very basic tasks. For this purpose, I create an artificial data set from 4 different 2-dimensional normal distributions to check how well the 4 clusters can be recognized by common clustering methods.

set1 <- matrix(cbind(rnorm(100,0,2),rnorm(100,0,2)),100,2)
set2 <- matrix(cbind(rnorm(100,0,2),rnorm(100,8,2)),100,2)
set3 <- matrix(cbind(rnorm(100,8,2),rnorm(100,0,2)),100,2)
set4 <- matrix(cbind(rnorm(100,8,2),rnorm(100,8,2)),100,2)
dati <- list(values=rbind(set1,set2,set3,set4),classes=c(rep(1,100),rep(2,100),rep(3,100),rep(4,100))) # clustering - common methods
op <- par(mfcol = c(2, 2))
par(las =1)
plot(dati$values, col = as.integer(dati$classes), xlim=c(-6,14), ylim = c(-6,14), xlab="", ylab="", main = "True Groups")
party <- kmeans(dati$values,4)
plot(dati$values, col = party$cluster, xlab = "", ylab = "", main = "kmeans")
hc = hclust(dist(dati$values), method = "ward")
memb <- cutree(hc, k = 4)
plot(dati$values, col = memb, xlab = "", ylab = "", main = "hclust Euclidean ward") hc = hclust(dist(dati$values), method = "complete")
memb <- cutree(hc, k = 4)
plot(dati$values, col = memb, xlab = "", ylab = "", main = "hclust Euclidean complete")
par(op)

lunedì 26 novembre 2007

This is one of that tiny function that help you to save yourself a lot of time! I thank the list for the suggestion!

library(gdata)
?keep
data(women, cars)
keep(cars)
## To remove all objects except cars, run:
## keep(cars, sure=TRUE)

martedì 16 ottobre 2007

Duplicate a figure with R

The following code attempts to reproduce the Figure 3 (top) in Liao L, Noble WS.
Combining pairwise sequence similarity and support vector machines for detecting remote protein evolutionary and structural relationships. Journal of computational biology. 2003;10(6):857-68 using only the Base graphics system in R.
Data can be downloaded from here (if it doesn't work, use Google's cache).
I imported the file in a spreadsheet and then I copied & pasted it in R.

jcb.scores = read.delim("clipboard")
attach(jcb.scores)
pdf("recomb_scores.pdf")
par(las =1) # To have horizontal labels for axes 2 and 4
plot(y~sort(SVM.pairwise.ROC, decreasing = TRUE), pch = 3, cex = 0.5,
xlab = "AUC", ylab = "Number of families", axes = FALSE,
xlim = c(0,1), ylim = c(0,60))
lines(y~sort(SVM.pairwise.ROC, decreasing = TRUE), lty = 1)
points(y~sort(FPS.ROC, decreasing = TRUE), pch = 4, cex = 0.5)
lines(y~sort(FPS.ROC, decreasing = TRUE), lty = 2)
points(y~sort(SVM.Fisher.ROC, decreasing = TRUE), pch = 8, cex = 0.5)
lines(y~sort(SVM.Fisher.ROC, decreasing = TRUE), lty = 3)
points(y~sort(SAM.ROC, decreasing = TRUE), pch = 0, cex = 0.5)
lines(y~sort(SAM.ROC, decreasing = TRUE), lty = 4)
points(y~sort(PSI.BLAST.ROC, decreasing = TRUE), pch = 15, cex = 0.5)
lines(y~sort(PSI.BLAST.ROC, decreasing = TRUE), lty = 5)
axis(1, at = seq(0,1,0.2), labels = c(0,0.2,0.4,0.6,0.8,1), tcl = 0.25, pos = 0) # tcl = 0.25 small ticks toward the curve
axis(2, at = c(0,10,20,30,40,50,60), labels=c(0,10,20,30,40,50,60), tcl= 0.25 , pos = 0)
axis(2, at = c(0,10,20,30,40,60), tcl= 0.25,labels = F, pos = 0)
axis(3, tick = T, tcl= 0.25, labels = F, pos = 60)
axis(4, at = c(0,10,20,30,40,50), tcl= 0.25, labels = F, pos = 1)
axis(4, at = c(0,10,20,30,40,60), tcl= 0.25, labels = F, pos = 1)
# To locate the legend interactively
xy.legend = locator(1)
# right-justifying a set of labels: thanks to Uwe Ligges
temp <- legend(xy.legend, legend = c("SVM-pairwise", "FPS","SVM-Fisher", "SAM","PSI-BLAST"), text.width = strwidth("SVM-pairwise"), xjust = 1, yjust = 1, lty = c(1,2,3,4,5), pch = c(3,4,8,0,15), bty = "n", cex = 0.8, title = "")
dev.off()
detach(jcb.scores)


The original image:

My version of the image (not exacly identical but...):

venerdì 14 settembre 2007

Plotting two or more overlapping density plots on the same graph

This post was updated.
See this thread from StackOverflow for other ways to solve this task.

plot.multi.dens <- function(s)
{
junk.x = NULL
junk.y = NULL
for(i in 1:length(s))
{
junk.x = c(junk.x, density(s[[i]])$x)
junk.y = c(junk.y, density(s[[i]])$y)
}
xr <- range(junk.x)
yr <- range(junk.y)
plot(density(s[[1]]), xlim = xr, ylim = yr, main = "")
for(i in 1:length(s))
{
lines(density(s[[i]]), xlim = xr, ylim = yr, col = i)
}
}
#usage:
x = rnorm(1000,0,1)
y = rnorm(1000,0,2)
z = rnorm(1000,2,1.5)
# the input of the following function MUST be a numeric list
plot.multi.dens(list(x,y,z))
library(Hmisc)
le <- largest.empty(x,y,.1,.1)
legend(le,legend=c("x","y","z"), col=(1:3), lwd=2, lty = 1)

giovedì 9 agosto 2007

R package installation and administration

A short list of basic but useful commands for managing
the packages in R:

# install a package
install.packages("ROCR")
# visualize package version
package_version("pamr")
# update a package
update.packages("Cairo")
# remove a package
remove.packages("RGtk2")

venerdì 3 agosto 2007

Sorting/ordering a data.frame according specific columns


x = rnorm(20)
y = sample(rep(1:2, each = 10))
z = sample(rep(1:4, 5))

data.df <- data.frame(values = x, labels.1 = y, labels.2 = z)
print(data.df)

# data ordered according to "labels.1" column
# and then "labels.2" column
nams <- c("labels.1", "labels.2")
data.df.sorted = data.df[do.call(order, data.df[nams]), ]
print(data.df.sorted)

giovedì 2 agosto 2007

Receiver Operating Characteristic (ROC) Curve in ROCR and verification packages

The following VERY basic code shows how to plot a simple ROC Curve both by means of ROCR package and by verification package.

# it allows two different plots in the same frame
par(mfrow = c(1,2))
# plot a ROC curve for a single prediction run
# and color the curve according to cutoff.
library(ROCR)
data(ROCR.simple)
pred <- prediction(ROCR.simple$predictions, ROCR.simple$labels)
perf <- performance(pred,"tpr", "fpr")
plot(perf,colorize = TRUE)
# plot a ROC curve for a single prediction run
# with CI by bootstrapping and fitted curve
library(verification)
roc.plot(ROCR.simple$labels,ROCR.simple$predictions, xlab = "False positive rate",
ylab = "True positive rate", main = NULL, CI = T, n.boot = 100, plot = "both", binormal = TRUE)