One R Tip A Day: visualization

Visualizzazione post con etichetta visualization. Mostra tutti i post

mercoledì 27 luglio 2011

Word Cloud in R

A word cloud (or tag cloud) can be an handy tool when you need to highlight the most commonly cited words in a text using a quick visualization. Of course, you can use one of the several on-line services, such as wordle or tagxedo , very feature rich and with a nice GUI. Being an R enthusiast, I always wanted to produce this kind of images within R and now, thanks to the recently released Ian Fellows' wordcloud package, finally I can!
In order to test the package I retrieved the titles of the XKCD web comics included in my RXKCD package and produced a word cloud based on the titles' word frequencies calculated using the powerful tm package for text mining (I know, it is like killing a fly with a bazooka!).

library(RXKCD)
library(tm)
library(wordcloud)
library(RColorBrewer)
path <- system.file("xkcd", package = "RXKCD")
datafiles <- list.files(path)
xkcd.df <- read.csv(file.path(path, datafiles))
xkcd.corpus <- Corpus(DataframeSource(data.frame(xkcd.df[, 3])))
xkcd.corpus <- tm_map(xkcd.corpus, removePunctuation)
xkcd.corpus <- tm_map(xkcd.corpus, content_transformer(tolower))
xkcd.corpus <- tm_map(xkcd.corpus, function(x) removeWords(x, stopwords("english")))
tdm <- TermDocumentMatrix(xkcd.corpus)
m <- as.matrix(tdm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:2)]
png("wordcloud.png", width=1280,height=800)
wordcloud(d$word,d$freq, scale=c(8,.3),min.freq=2,max.words=100, random.order=T, rot.per=.15, colors=pal, vfont=c("sans serif","plain"))
dev.off()

As a second example, inspired by this post from the eKonometrics blog, I created a word cloud from the description of 3177 available R packages listed at http://cran.r-project.org/web/packages.

require(XML)
require(tm)
require(wordcloud)
require(RColorBrewer)
u = "http://cran.r-project.org/web/packages/available_packages_by_date.html"
t = readHTMLTable(u)[[1]]
ap.corpus <- Corpus(DataframeSource(data.frame(as.character(t[,3]))))
ap.corpus <- tm_map(ap.corpus, removePunctuation)
ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
ap.corpus <- tm_map(ap.corpus, function(x) removeWords(x, stopwords("english")))

ap.corpus <- Corpus(VectorSource(ap.corpus))
ap.tdm <- TermDocumentMatrix(ap.corpus)
ap.m <- as.matrix(ap.tdm)
ap.v <- sort(rowSums(ap.m),decreasing=TRUE)
ap.d <- data.frame(word = names(ap.v),freq=ap.v)
table(ap.d$freq)
pal2 <- brewer.pal(8,"Dark2")
png("wordcloud_packages.png", width=1280,height=800)
wordcloud(ap.d$word,ap.d$freq, scale=c(8,.2),min.freq=3,
max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal2)
dev.off()

As a third example, thanks to Jim's comment, I take advantage of Duncan Temple Lang's RNYTimes package to access user-generate content on the NY Times and produce a wordcloud of 'today' comments on articles.
Caveat: in order to use the RNYTimes package you need a API key from The New York Times which you can get by registering to the The New York Times Developer Network (free of charge) from here.

require(XML)
require(tm)
require(wordcloud)
require(RColorBrewer)
install.packages(packageName, repos = "http://www.omegahat.org/R", type = "source")
require(RNYTimes)
my.key <- "your API key here"
what= paste("by-date", format(Sys.time(), "%Y-%m-%d"),sep="/")
# what="recent"
recent.news <- community(what=what, key=my.key)
pagetree <- htmlTreeParse(recent.news, error=function(...){}, useInternalNodes = TRUE)
x <- xpathSApply(pagetree, "//*/body", xmlValue)
# do some clean up with regular expressions
x <- unlist(strsplit(x, "\n"))
x <- gsub("\t","",x)
x <- sub("^[[:space:]]*(.*?)[[:space:]]*$", "\\1", x, perl=TRUE)
x <- x[!(x %in% c("", "|"))]
ap.corpus <- Corpus(DataframeSource(data.frame(as.character(x))))
ap.corpus <- tm_map(ap.corpus, removePunctuation)
ap.corpus <- tm_map(ap.corpus, content_transformer(tolower))
ap.corpus <- tm_map(ap.corpus, function(x) removeWords(x, stopwords("english")))
ap.tdm <- TermDocumentMatrix(ap.corpus)
ap.m <- as.matrix(ap.tdm)
ap.v <- sort(rowSums(ap.m),decreasing=TRUE)
ap.d <- data.frame(word = names(ap.v),freq=ap.v)
table(ap.d$freq)
pal2 <- brewer.pal(8,"Dark2")
png("wordcloud_NewYorkTimes_Community.png", width=1280,height=800)
wordcloud(ap.d$word,ap.d$freq, scale=c(8,.2),min.freq=2,
max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal2)
dev.off()

mercoledì 5 agosto 2009

Locate the position of CRAN mirror sites on a map using Google Maps

Inspired by this post (suggested here by the always useful Revolutions blog), I attempted to plot the position of CRAN mirrors on a map taking advantage of the nice R package RgoogleMaps (check the dependencies!). Below the code:

library(XML)
# download.file("http://www.maths.lancs.ac.uk/~rowlings/R/Cranography/cran.gml",destfile="cran.gml")
cran.gml <- xmlInternalTreeParse("cran.gml")
# Create a data.frame assembling all the information from the gml file
Name <- sapply(getNodeSet(cran.gml, "//ogr:Name"), xmlValue)
Country <- sapply(getNodeSet(cran.gml, "//ogr:Country"), xmlValue)
City <- sapply(getNodeSet(cran.gml, "//ogr:City"), xmlValue)
URL <- sapply(getNodeSet(cran.gml, "//ogr:URL"), xmlValue)
Host <- sapply(getNodeSet(cran.gml, "//ogr:Host"), xmlValue)
Maintainer <- sapply(getNodeSet(cran.gml, "//ogr:Maintainer"), xmlValue)
CountryCode <- sapply(getNodeSet(cran.gml, "//ogr:countryCode"), xmlValue)
lng <- as.numeric(sapply(getNodeSet(cran.gml, "//ogr:lng"), xmlValue))
lat <- as.numeric(sapply(getNodeSet(cran.gml, "//ogr:lat"), xmlValue))
cran.mirrors <- data.frame(Name, Country, City, URL, Host, Maintainer, CountryCode, lng, lat)
# cran.mirrors <- cbind(getCRANmirrors(), lng, lat) ## alternatively
library(RgoogleMaps)
# Define the markers:
cran.markers <- cbind.data.frame( lat=cran.mirrors$lat, lon=cran.mirrors$lng, 
size=rep('tiny', length(cran.mirrors$lat)), col=colors()[1:length(cran.mirrors$lat)], 
char=rep('',length(cran.mirrors$lat)) )
# Get the bounding box:
bb <- qbbox(lat = cran.markers[,"lat"], lon = cran.markers[,"lon"])
num.mirrors <- 1:dim(cran.markers)[1] ## to visualize only a subset of the cran.mirrors
maptype <- c("roadmap", "mobile", "satellite", "terrain", "hybrid", "mapmaker-roadmap", "mapmaker-hybrid")[1]
# Download the map (either jpg or png): 
MyMap <- GetMap.bbox(bb$lonR, bb$latR, destfile = paste("Map_", maptype, ".png", sep=""), GRAYSCALE=F, maptype = maptype)
# Plot:
png(paste("CRANMirrorsMap_", maptype,".png", sep=""), 640, 640)
tmp <- PlotOnStaticMap(MyMap,lat = cran.markers[num.mirrors,"lat"], lon = cran.markers[num.mirrors,"lon"], 
cex=1, pch="R",col=as.numeric(cran.mirrors$Country), add=F)
dev.off()

## Hosts from Italy
maptype <- c("roadmap", "mobile", "satellite", "terrain", "hybrid", "mapmaker-roadmap", "mapmaker-hybrid")[4]
num.it <- row.names(cran.mirrors[cran.mirrors$CountryCode=="IT",])
# Get the bounding box:
bb.it <- qbbox(lat = cran.markers[num.it,"lat"], lon = cran.markers[num.it,"lon"])
# Download the map (either jpg or png):
ITMap <- GetMap.bbox(bb.it$lonR, bb.it$latR, destfile = paste("ITMap_", maptype, ".png", sep=""), GRAYSCALE=F, maptype = maptype)
#ITMap <- GetMap.bbox(bb.it$lonR, bb.it$latR, destfile = paste("ITMap_", maptype, ".jpg", sep=""), GRAYSCALE=F, maptype = maptype)
# Plot:
png(paste("CRANMirrorsMapIT_", maptype,".png", sep=""), 640, 640);
tmp <- PlotOnStaticMap(ITMap,lat = cran.markers[num.it,"lat"], lon = cran.markers[num.it,"lon"], 
cex=2, pch="R",col="dodgerblue", add=F)
# tmp <- PlotOnStaticMap(ITMap,lat = cran.markers[num.it,"lat"], lon = cran.markers[num.it,"lon"],labels=as.character(cran.mirrors[cran.mirrors$CountryCode=="IT",]$Host),col="black", FUN=text, add=T)
dev.off()

CAVEAT: To reproduce the example you need the gml file you can download from here , a Google account and a Google Maps API key. Here you can sign up for a free API key.

sabato 6 giugno 2009

Two plot with a common legend - base graphics

If you need to share a common legend between two graphs using the ggplot2 package/paradigm take a look at this post from the Learning R blog.
The code below solves the same task using the R base graphics.

png( "2plot1legend.png", width = 480, height = 680)
par(mfrow = c(2, 1), oma = c(0, 0, 0, 2))
plot(hp~mpg, data=mtcars, col=cyl,pch=19)
plot(disp~wt, data=mtcars, col=cyl,pch=19)
par(xpd=NA)
#legend(locator(1), legend=as.numeric(levels(factor(mtcars$cyl))), pch=19, col= as.numeric(levels(factor(mtcars$cyl))) )
legend(x=5.6, y=690, legend=as.numeric(levels(factor(mtcars$cyl))), pch=19, col= as.numeric(levels(factor(mtcars$cyl))) )
dev.off()

mercoledì 25 marzo 2009

Alternative implementations using ggplot2

Here and here, you can find alternative implementations of two plots (1, 2) I created time ago using R basic graphic. The author recreates the plots taking advantage of the excellent ggplot2 package.

mercoledì 21 gennaio 2009

Radar chart

I thank David for the following example of radar chart:

corelations <- c(1:97)

corelation.names <- names(corelations) <- c("Alp12Mn",

"AvrROE", "DivToP", "GrowAPS", "GrowAsst", "GrowBPS", "GrowCFPS",

"GrowDPS", "GrowEPS", "GrowSPS", "HistAlp", "HistSigm", "InvVsSal",

"LevGrow", "Payout5", "PredSigm", "RecVsSal", "Ret12Mn", "Ret3Mn",

"Ret1Mn", "ROE", "_CshPlow", "_DDM", "_EarnMom", "_EstChgs",

"_EstRvMd", "_Neglect", "_NrmEToP", "_PredEToP", "_RelStMd", "_ResRev",

"_SectMom", "AssetToP", "ARM_Pref_Earnings", "AvrCFtoP", "AvrDtoP",

"AvrEtoP", "ARM_Sec_Earnings", "BondSens", "BookToP", "Capt",

"CaptAdj", "CashToP", "CshFlToP", "CurrSen", "DivCuts5", "EarnToP",

"Earnvar", "Earnyld", "Growth", "HistBeta", "IndConc", "Leveflag",

"Leverag", "Leverage", "Lncap", "Momentum", "Payoflag", "PredBeta",

"Ret_11M_Momentum", "PotDilu", "Price", "ProjEgro", "RecEPSGr",

"SalesToP", "Size", "SizeNonl", "Tradactv", "TradVol", "Value",

"VarDPS", "Volatility", "Yield", "CFROI", "ADJUST", "ERC", "RC", "SPX",

"R1000", "MarketCap", "TotalRisk", "Value_AX", "truncate_ret_1mo",

"truncate_PredSigma", "Residual_Returns", "ARM_Revenue",

"ARM_Rec_Comp", "ARM_Revisions_Comp", "ARM_Global_Rank", "ARM_Score",

"TEMP", "EQ_Raw", "EQ_Region_Rank", "EQ_Acc_Comp", "EQ_CF_Comp",

"EQ_Oper_Eff_Comp", "EQ_Exc_Comp")

corelations <- c(0.223, 0.1884, -0.131, 0.1287, 0.0307,

0.2003, 0.2280, 0.1599, 0.2680, 0.2596, 0.3399, 0.0324, 0.0382, -0.173,

-0.177, -0.056, -0.063, 0.2211, 0.0674, -0.023, 0.2641, 0.2369, 0.1652,

-0.023, 0.1070, 0.0791, -0.023, 0.0434, -0.002, -0.001, -0.000, -0.108,

-0.288, 0.1504, -0.127, -0.142, 0.0852, 0, -0.031, -0.320, 0.0785,

0.0465, -0.166, 0.1416, 0.0945, -0.063, 0.1461, -0.305, 0.1215, 0.0776,

0.0449, 0.0823, -0.018, -0.261, -0.318, 0.1194, 0.3151, -0.124, 0.1037,

0.2240, -0.115, 0.1543, 0, 0.1775, -0.153, 0.1194, 0.1407, 0.1047,

0.0926, -0.403, 0.0067, -0.048, -0.136, 0.1068, 0.0381, 0.1878, -0.035,

0.0761, 0.0784, 0, 0, 0, -0.018, 0.1602, 0.0543, 0, -0.013, 0.1439, 0,

0, -0.054, 0.7426, 0.7510, 0.1657, 0.1657, 0.4949, 1.0000)

require(plotrix)

par(ps=6)

radial.plot(corelations, labels=corelation.names,rp.type="p",main="Correlation Radar", radial.lim=c(-1,1),line.col="blue")

lunedì 5 gennaio 2009

Statistical Visualizations - Part 2

martedì 23 dicembre 2008

Statistical Visualizations

Inspired by this interesting post, I decided to reproduce some of the plots using R code.

The data are c & p from here:

>original
     Europe    Asia Americas Africa Oceania
1820-30   106487      36    11951     17   33333
1831-40   495681      53    33424     54   69911
1841-50  1597442     141    62469     55   53144
1851-60  2452577   41538    74720    210   29169
1861-70  2065141   64759   166607    312   18005
1871-80  2271925  124160   404044    358   11704
1881-90  4735484   69942   426967    857   13363
1891-00  3555352   74862    38972    350   18028
1901-10  8056040  323543   361888   7368   46547
1911-20  4321887  247236  1143671   8443   14574
1921-30  2463194  112059  1516716   6286    8954
1931-40   347566   16595   160037   1750    2483
1941-50   621147   37028   354804   7367   14693
1951-60  1325727  153249   996944  14092   25467
1961-70  1123492  427642  1716374  28954   25215
1971-80   800368 1588178  1982735  80779   41254
1981-90   761550 2738157  3615225 176893   46237
1991-00  1359737 2795672  4486806 354939   98263
2001-06  1073726 2265696  3037122 446792  185986

png("immigration_log_scatter_BW.png", width = 560, height = 480)
par( mar=c(7, 7, 3, 3) )
plot( original$Europe, log="y", type="l", col="grey20", lty=1,
ylim=c(10, 10000000), xlab="Year Interval", ylab="Number of Immigrants Admitted to the United States",
lwd=2, xaxt='n', yaxt='n', mgp=c(4.5,1,0) )  # xaxt='n' an d yaxt='n'- do not show x and y axis
for (i in 2:dim(original)[[2]]){
lines(original[, i], type="l", lty=i, col="grey20")
}
axis(1, 1:dim(original)[[1]], rownames(original), las=2)
axis(2, at=c(10,100,1000,10000,100000,1000000,10000000), labels=c(10,100,1000,10000,100000,1000000,10000000), las=2, tck=1, col="grey85")
box()
legend( 14,400, legend=colnames(original), lty=c(1:5) )
dev.off()

png("immigration_stacked_chart.png", width = 560, height = 480)
library(plotrix)
par( mar=c(6, 6, 3, 3) , las=1)
colori4<-c("yellow", "darkred","green","brown1", "steelblue")
stackpoly( original[, 5:1], col=smoothColors(colori4), border=NA,stack=T, xaxlab=rownames(original),
        ylim=c(10,10000000), staxx=TRUE, axis4=F, main="Immigration to the USA - 1821 to 2006" )
legend("topleft", legend=colnames(original), fill=smoothColors(colori4)[5:1] )
dev.off()