lattice
Plotting Systemggplot2
Plotting Systemsummary(data)
= returns min, 1st quartile, median, mean, 3rd quartile, max; boxplot(data, col = "blue")
= produces a box with middles 50% highlighted in the specified color
whiskers
= \(\pm 1.58IQR/\sqrt{n}\)
box
= 25%, median, 75%; hist(data, col = "green")
= produces a histogram with specified breaks and color
breaks = 100
= the higher the number is the smaller/narrower the histogram columns arerug(data)
= density plot, add a strip under the histogram indicating location of each data point; barplot(data, col = "wheat")
= produces a bar graph, usually for categorical data
abline(h/v = 12)
= overlays horizontal/vertical line at specified location
col = "red"
= specifies colorlwd = 4
= line widthlty = 2
= line type; boxplot(pm25 ~ region, data = pollution, col = "red")
par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))
= set marginhist(subset(pollution, region == "east")$pm25, col = "green")
= first histogramhist(subset(pollution, region == "west")$pm25, col = "green")
= second histogramwith(pollution, plot(latitude, pm25, col = region))
abline(h = 12, lwd = 2, lty = 2)
= plots horizontal dotted lineplot(jitter(child, 4)~parent, galton)
= spreads out data points at the same position to simulate measurement error/make high frequency more visible; par(mfrow = c(1, 2), mar = c(5, 4, 2, 1))
= sets marginswith(subset(pollution, region == "west"), plot(latitude, pm25, main = "West"))
= left scatterplotwith(subset(pollution, region == "east"), plot(latitude, pm25, main = "East"))
= right scatterplotplot(x, y)
or hist(x)
will launch a graphics device and draw a plot on device
?par
airquality <- transform(airquality, Month = factor(Month))
pch
: plotting symbol (default = open circle)lty
: line type (default is solid)
lwd
: line width (integer)col
: plotting color (number string or hexcode, colors() returns vector of colors)xlab
, ylab
: x-y label character stringscex
: numerical value giving the amount by which plotting text/symbols should be magnified relative to the default
cex = 0.15 * variable
: plot size as an additional variablepar()
function = specifies global graphics parameters, affects all plots in an R session (can be overridden)
las
: orientation of axis labelsbg
: background colormar
: margin size (order = bottom left top right)oma
: outer margin size (default = 0 for all sides)mfrow
: number of plots per row, column (plots are filled row-wise)mfcol
: number of plots per row, column (plots are filled column-wise)par("parameter")
lines
: adds lines to a plot, given a vector of x values and corresponding vector of y values; points
: adds a point to the plottext
: add text labels to a plot using specified x,y coordinatestitle
: add annotations to x,y axis labels, title, subtitles, outer marginmtext
: add arbitrary text to margins (inner or outer) of plotaxis
: specify axis tickslibrary(datasets)
# type = "n" sets up the plot and does not fill it with data
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in New York City", type = "n"))
# subsets of data are plotted here using different colors
with(subset(airquality, Month == 5), points(Wind, Ozone, col = "blue"))
with(subset(airquality, Month != 5), points(Wind, Ozone, col = "red"))
legend("topright", pch = 1, col = c("blue", "red"), legend = c("May", "Other Months"))
model <- lm(Ozone ~ Wind, airquality)
# regression line is produced here
abline(model, lwd = 2)
example(points)
in R will launch a demo of base plotting system and may provide some helpful tips on graphing # this expression sets up a plot with 1 row 3 columns, sets the margin and outer margins
par(mfrow = c(1, 3), mar = c(4, 4, 2, 1), oma = c(0, 0, 2, 0))
with(airquality, {
# here three plots are filled in with their respective titles
plot(Wind, Ozone, main = "Ozone and Wind")
plot(Solar.R, Ozone, main = "Ozone and Solar Radiation")
plot(Temp, Ozone, main = "Ozone and Temperature")
# this adds a line of text in the outer margin*
mtext("Ozone and Weather in New York City", outer = TRUE)}
)
quartz()
on Mac, windows()
on Windows, x11()
on Unix/Linux?Devices
= lists devices founddev.off()
pdf
: useful for line type graphics, resizes well, usually portable, not efficient if too many pointssvg
: XML based scalable vector graphics, support animation and interactivity, web basedwin.metafile
: Windows metafile formatpostscript
: older format, resizes well, usually portable, can create encapsulated postscript file, Windows often don’t have postscript viewer (postscript = predecessor of PDF)png
: Portable Network Graphics, good for line drawings/image with solid colors, uses lossless compression, most web browsers read this natively, good for plotting a lot of data points, does not resize wellJPEG
: good for photographs/natural scenes/gradient colors, size efficient, uses lossy compression, good for plotting many points, does not resize well, can be read by almost any computer/browser, not great for line drawings (aliasing on edges)tiff
: common bitmap format supports lossless compressionbmp
: native Windows bitmapped formatdev.cur()
= returns the currently active devicedev.set(<integer>)
= change the active graphics device <integer>
= number associated with the graphics device you want to switch todev.copy()
= copy a plot from one device to anotherdev.copy2pdf()
= specifically for copying to PDF files## Create plot on screen device
with(faithful, plot(eruptions, waiting))
## Add a main title
title(main = "Old Faithful Geyser data")
## Copy my plot to a PNG file
dev.copy(png, file = "geyserplot.png")
## Don't forget to close the PNG device!
dev.off()
lattice
Plotting Systemlibrary(lattice)
= load lattice systemlattice
and grid
packages
lattice
package = contains code for producing Trellis graphics (independent from base graphics system)grid
package = implements the graphing system; lattice build on top of gridpanel
functions can be specified/customized to modify the subplotstrellis.par.set()
\(\rightarrow\) can be used to set global graphic parameters for all trellis objectslattice
Functions and Parametersxyplot()
= main function for creating scatterplotsbwplot()
= box and whiskers plots (box plots)histogram()
= histogramsstripplot()
= box plot with actual pointsdotplot()
= plot dots on “violin strings”splom()
= scatterplot matrix (like pairs() in base plotting system)levelplot()
/contourplot()
= plotting image dataxyplot(y ~ x | f * g, data, layout, panel)
~
) = left hand side is the y-axis variable, and the right hand side is the x-axis variablef
/g
= conditioning/categorical variables (optional)
*
indicates interaction between two variablesf
and g
data
= the data frame/list from which the variables should be looked up
layout
= specifies how the different plots will appear
layout = c(5, 1)
= produces 5 subplots in a horizontal fashionpanel
function can be added to control what is plotted inside each panel of the plot
panel
functions receive x/y coordinates of the data points in their panel (along with any additional arguments)?panel.xyplot
= brings up documentation for the panel functionslattice
Examplelibrary(lattice)
# Reproducible example: two groups with different x-y relationships
set.seed(10)
x <- rnorm(100)
# group indicator: first 50 points are group 0, last 50 are group 1
f <- rep(0:1, each = 50)
# group 0: y = x + noise; group 1: y = 1 + noise (the f*x term cancels the slope)
y <- x + f - f * x+ rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group 1", "Group 2"))
## Plot with 2 panels with custom panel function
xyplot(y ~ x | f, panel = function(x, y, ...) {
# call the default panel function for xyplot (draws the points)
panel.xyplot(x, y, ...)
# adds a horizontal dashed line at the median of this panel's y values
panel.abline(h = median(y), lty = 2)
# overlays a simple linear regression line (col = 2 is red)
panel.lmline(x, y, col = 2)
})
ggplot2
Plotting Systemlibrary(ggplot2)
= loads ggplot2 package“In brief, the grammar tells us that a statistical graphic is a mapping from data to aesthetic attributes (color, shape, size) of geometric objects (points, lines, bars). The plot may also contain statistical transformations of the data and is drawn on a specific coordinate system”
ggplot2
Functions and Parametersggplot2
graphic
qplot(x, y, data , color, geom)
= quick plot, analogous to base system’s plot()
function
color = factor1
= use the factor variable to display subsets of data in different colors on the same plot (legend automatically generated)shape = factor2
= use the factor variable to display subsets of data in different shapes on the same plot (legend automatically generated)library(ggplot2)
qplot(displ, hwy, data = mpg, color = drv, shape = drv)
geom = c("points", "smooth")
= add a smoother/“low S”
method = "lm"
= additional argument method can be specified to create different lines/confidence intervals
lm
= linear regressionqplot(displ, hwy, data = mpg, geom = c("point", "smooth"), method="lm")
fill = factor1
= can be used to fill the histogram with different colors for the subsets (legend automatically generated)qplot(hwy, data = mpg, fill = drv)
facets = rows ~ columns
= produce different subplots by factor variables specified (rows/columns)"."
indicates there are no addition row or columnfacets = . ~ columns
= creates 1 by col subplotsfacets = row ~ .
= creates row row by 1 subplotsqplot(displ, hwy, data = mpg, facets = . ~ drv)
qplot(hwy, data = mpg, facets = drv ~ ., binwidth = 2)
geom = "density"
= replaces the default scatterplot with density smooth curveggplot()
g <- ggplot(data, aes(var1, var2))
ggplot
and specifies the data frame that will be usedaes(var1, var2)
= specifies aesthetic mapping, or var1 = x variable, and var2 = y variablesummary(g)
= displays summary of ggplot objectprint(g)
= returns error (“no layer on plot”) which means the plot does know how to draw the data yetg + geom_point()
= takes information from g object and produces scatter plot+ geom_smooth()
= adds low S mean curve with confidence interval
method = "lm"
= changes the smooth curve to be linear regressionsize = 4
, linetype = 3
= can be specified to change the size/style of the linese = FALSE
= turns off confidence interval+ facet_grid(row ~ col)
= splits data into subplots by factor variables (see facets from qplot()
)
cutPts <- quantile(df$cVar, seq(0, 1, length=4), na.rm = TRUE)
= creates quantiles where the continuous variable will be cut
seq(0, 1, length=4)
= creates 4 quantile pointsna.rm = TRUE
= removes all NA valuesdf$newFactor <- cut(df$cVar, cutPts)
= creates new categorical/factor variable by using the cutpoints
xlab()
, ylab()
, labs()
, ggtitle()
= for labels and titles
labs(x = expression("log " * PM[2.5]), y = "Nocturnal")
= specifies x and y labelsexpression()
= used to produce mathematical expressionsgeom
functions = many options to modifytheme()
= for global changes in presentation
theme(legend.position = "none")
theme_gray()
and theme_bw()
base_family = "Times"
= changes font to Times+ geom_point(color, size, alpha)
= specifies how the points are supposed to be plotted on the graph (style)
color = "steelblue"
= specifies color of the data pointsaes(color = var1)
= wrapping color argument this way allows a factor variable to be assigned to the data points, thus subsetting it with different colors based on factor variable valuessize = 4
= specifies size of the data pointsalpha = 0.5
= specifies transparency of the data points+ ylim(-3, 3)
= limits the range of y variable to a specific range
+ coord_cartesian(ylim(-3, 3))
= this will limit the visible range but plot all points of the dataggplot2
Comprehensive Example# initiates ggplot
# Initiate the ggplot object: maacs data, log PM2.5 on x, nocturnal symptoms on y
g <- ggplot(maacs, aes(logpm25, NocturnalSympt))
# Build the full plot. NOTE: each `+` must END a line (not begin the next one),
# otherwise R evaluates each line as a separate, incomplete expression.
# Curly quotes and a missing ")" in the original have also been fixed.
g + geom_point(alpha = 1/3) +                                 # add points
  facet_wrap(bmicat ~ no2dec, nrow = 2, ncol = 4) +           # make panels
  geom_smooth(method = "lm", se = FALSE, col = "steelblue") + # add smoother
  theme_bw(base_family = "Avenir", base_size = 10) +          # change theme
  labs(x = expression("log " * PM[2.5])) +                    # x label
  labs(y = "Nocturnal Symptoms") +                            # y label
  labs(title = "MAACS Cohort")                                # title
hclust
function)dist(data.frame(x=x, y=y)
= returns pair wise distances for all of the (x,y) coordinatesdist()
function uses Euclidean distance by default method = "complete"
or "average"
in hclust()
functionhclust
function with same parameters and the same data will produce the same plothclust
Function and Examplehh <- hclust(dist(dataFrame))
function = produces a hierarchical cluster object based on pair wise distances from a data frame of x and y values
dist()
= defaults to Euclidean, calculates the distance/similarity between two observations; when applied to a data frame, the function applies the \(\sqrt{(A_1 - A_2)^2 + (B_1 - B_2)^2 + ... + (Z_1 -Z_2)^2 }\) formula to every pair of rows of data to construct a matrix of distances between the rows
plot(hh)
= plots the dendrogramnames(hh)
= returns all parameters of the hclust
object
hh$order
= returns the order of the rows/clusters from the dendrogramhh$dist.method
= returns method for calculating distance/similarityhclust
Exampleset.seed(1234)
# Simulate 12 points falling into 3 clusters of 4 (x means 1, 2, 3)
x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2)
# y means 1, 2, 1 — clusters 1 and 3 differ mainly in x
y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2)
dataFrame <- data.frame(x=x,y=y)
# pairwise Euclidean distances between the 12 points (dist default)
distxy <- dist(dataFrame)
# complete-linkage hierarchical clustering (hclust default method)
hClustering <- hclust(distxy)
# draw the dendrogram
plot(hClustering)
myplcclust
Function and Examplemyplcclust
= a function to plot hclust
objects in color (clusters labeled 1 2 3 etc.), but must know how many clusters there are initially myplclust <- function(hclust, lab = hclust$labels,
lab.col = rep(1, length(hclust$labels)), hang = 0.1, ...) {
## Modification of plclust for plotting hclust objects *in colour*!
## Copyright Eva KF Chan 2009
## Arguments:
##   hclust  : hclust object
##   lab     : a character vector of labels of the leaves of the tree
##   lab.col : colour for the labels; NA = default device foreground colour
##   hang    : as in hclust & plclust
## Side effect: a display of the hierarchical cluster with coloured leaf labels.
# Each merge height is duplicated: one copy per child of the merge
y <- rep(hclust$height, 2)
# In hclust$merge, negative entries are leaves, positive entries are earlier merges
x <- as.numeric(hclust$merge)
# Keep only the leaf entries and the heights at which each leaf is merged
y <- y[which(x < 0)]
x <- x[which(x < 0)]
x <- abs(x)
# Put heights into leaf-index order
y <- y[order(x)]
x <- x[order(x)]
# Draw the dendrogram without labels, then place coloured labels manually,
# offset below each leaf by hang * max height (mimicking plclust's hang)
plot(hclust, labels = FALSE, hang = hang, ...)
text(x = x, y = y[hclust$order] - (max(hclust$height) * hang), labels = lab[hclust$order],
col = lab.col[hclust$order], srt = 90, adj = c(1, 0.5), xpd = NA, ...)
}
# example
dataFrame <- data.frame(x = x, y = y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy)
myplclust(hClustering, lab = rep(1:3, each = 4), lab.col = rep(1:3, each = 4))
heatmap
Function and Exampleheatmap(data.matrix)
function = similar to image(t(x))
# Reproducible 40 x 10 matrix of standard-normal noise
set.seed(12345)
data <- matrix(rnorm(400), nrow = 40)
# heatmap() clusters rows and columns and draws dendrograms alongside the image
heatmap(data)
image
Function and Exampleimage(x, y, t(dataMatrix)[, nrow(dataMatrix):1])
= produces similar color grid plot as the heatmap()
without the dendrograms
t(dataMatrix)[, nrow(dataMatrix)]
t(dataMatrix)
= transpose of dataMatrix, this is such that the plot will be displayed in the same fashion as the matrix (rows as values on the y axis and columns as values on the x axis)
[, nrow(dataMatrix)]
= subsets the data frame in reverse column order; when combined with the t()
function, it reorders the rows of data from 40 to 1, such that the data from the matrix is displayed in order from top to bottom
x
, y
= used to specify the values displayed on the x and y axis
image(1:10, 1:40, t(data)[, nrow(data):1])
kmeans
function)set.seed(1234)
# Same simulated 3-cluster data as the hclust example
x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2)
y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2)
dataFrame <- data.frame(x=x,y=y)
# specifies initial number of clusters to be 3
kmeansObj <- kmeans(dataFrame,centers=3)
# components of the returned kmeans object
names(kmeansObj)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# returns cluster assignments
kmeansObj$cluster
## [1] 3 3 3 3 1 1 1 1 2 2 2 2
# shrink margins so the plot fills the device
par(mar=rep(0.2,4))
# points colored by assigned cluster
plot(x,y,col=kmeansObj$cluster,pch=19,cex=2)
# overlay the 3 cluster centers as crosses
points(kmeansObj$centers,col=1:3,pch=3,cex=3,lwd=3)
# Inject a pattern into a random subset of rows: for each of the 40 rows,
# flip a fair coin and, on heads, add 0 to the first 5 columns and 3 to the
# last 5 — affected rows get higher means in columns 6-10
for(i in 1:40){
# flip a coin (rbinom with size = 1 returns 0 or 1)
coinFlip <- rbinom(1,size=1,prob=0.5)
# if coin is heads add a common pattern to that row
if(coinFlip){
data[i,] <- data[i,] + rep(c(0,3),each=5)
}
}
# Hierarchical clustering of the rows, then reorder the matrix by the
# dendrogram order so similar rows sit next to each other
hh <- hclust(dist(data))
dataOrdered <- data[hh$order, ]
# create 1 x 3 panel plot
par(mfrow = c(1, 3))
# heat map (sorted): transpose + reversed columns so row 1 appears at the top
image(t(dataOrdered)[, nrow(dataOrdered):1])
# row means (40 rows); the stray empty argument (",,") in the original
# plot call has been removed
plot(rowMeans(dataOrdered), 40:1, xlab = "Row Mean", ylab = "Row", pch = 19)
# column means (10 columns)
plot(colMeans(dataOrdered), xlab = "Column", ylab = "Column Mean", pch = 19)
s <- svd(data)
= performs SVD on data (\(n \times m\) matrix) and splits it into \(u\), \(v\), and \(d\) matrices
s$u
= \(n \times m\) matrix \(\rightarrow\) horizontal variations$d
= \(1 \times m\) vector \(\rightarrow\) vector of the singular/diagonal values
diag(s$d)
= \(m \times m\) diagonal matrixs$v
= \(m \times m\) matrix \(\rightarrow\) vertical variations$u %*% diag(s$d) %*% t(s$v)
= returns the original data \(\rightarrow\) \(X = UDV^T\)scale(data)
= scales the original data by subtracting each data point by its column mean and dividing by its column standard deviation# running svd
# SVD of the column-scaled, row-ordered data matrix
svd1 <- svd(scale(dataOrdered))
# create 1 by 3 panel plot
par(mfrow = c(1, 3))
# data heatmap (sorted)
image(t(dataOrdered)[, nrow(dataOrdered):1])
# U matrix - first column (one value per row; 40:1 puts row 1 at the top);
# the stray empty argument (",,") in the original plot call has been removed
plot(svd1$u[, 1], 40:1, xlab = "Row", ylab = "First left singular vector", pch = 19)
# V matrix - first column (one value per data column)
plot(svd1$v[, 1], xlab = "Column", ylab = "First right singular vector", pch = 19)
s$d
vector) captures the singular values, or variation in data that is explained by that particular component (variable/column/dimension)# create 1 x 2 panel plot
par(mfrow=c(1,2))
# plot singular values
plot(svd1$d,xlab="Column",ylab="Singular value",pch=19)
# plot proportion of variance explained
plot(svd1$d^2/sum(svd1$d^2),xlab="Column",ylab="Prop. of variance explained",pch=19)
p <- prcomp(data, scale = TRUE)
= performs PCA on data specified
scale = TRUE
= scales the data before performing PCAprcomp
objectsummary(p)
= prints out the principal component’s standard deviation, proportion of variance, and cumulative proportion# SVD
svd1 <- svd(scale(dataOrdered))
# PCA
pca1 <- prcomp(dataOrdered,scale=TRUE)
# Plot the rotation from PCA (Principal Components) vs v vector from SVD
plot(pca1$rotation[,1],svd1$v[,1],pch=19,xlab="Principal Component 1",
ylab="Right Singular Vector 1")
abline(c(0,1))
# summarize PCA
summary(pca1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9930 1.2518 1.0905 1.0024 0.90836 0.72211 0.60630
## Proportion of Variance 0.3972 0.1567 0.1189 0.1005 0.08251 0.05214 0.03676
## Cumulative Proportion 0.3972 0.5539 0.6728 0.7733 0.85582 0.90797 0.94473
## PC8 PC9 PC10
## Standard deviation 0.52145 0.40286 0.34423
## Proportion of Variance 0.02719 0.01623 0.01185
## Cumulative Proportion 0.97192 0.98815 1.00000
set.seed(678910)
# setting pattern: start from pure noise (40 x 10)
data <- matrix(rnorm(400), nrow = 40)
for(i in 1:40){
# flip two independent coins per row
coinFlip1 <- rbinom(1,size=1,prob=0.5)
coinFlip2 <- rbinom(1,size=1,prob=0.5)
# if coin 1 is heads add a block pattern: 0 to columns 1-5, 5 to columns 6-10
if(coinFlip1){
data[i,] <- data[i,] + rep(c(0,5),each=5)
}
# if coin 2 is heads add an alternating pattern: 0,5,0,5,... across columns
if(coinFlip2){
data[i,] <- data[i,] + rep(c(0,5),5)
}
}
# cluster the rows and reorder the matrix by the dendrogram order
hh <- hclust(dist(data)); dataOrdered <- data[hh$order,]
# perform SVD
svd2 <- svd(scale(dataOrdered))
# 2 x 3 panel: data + the two true patterns (top), data + detected patterns (bottom)
par(mfrow=c(2,3))
image(t(dataOrdered)[,nrow(dataOrdered):1])
plot(rep(c(0,1),each=5),pch=19,xlab="Column", main="True Pattern 1")
plot(rep(c(0,1),5),pch=19,xlab="Column",main="True Pattern 2")
image(t(dataOrdered)[,nrow(dataOrdered):1])
# the first two right singular vectors approximate the two injected patterns
plot(svd2$v[,1],pch=19,xlab="Column",ylab="First right singular vector",
main="Detected Pattern 1")
plot(svd2$v[,2],pch=19,xlab="Column",ylab="Second right singular vector",
main="Detected Pattern 2")
NA
valuesimpute
package from Bioconductor can help approximate missing values from surrounding values
impute.knn
function takes the missing row and imputes the data using the k
nearest neighbors to that row
k=10
= default value (take the nearest 10 rows)library(impute) ## Available from http://bioconductor.org
# copy the ordered data so we can knock values out
data2 <- dataOrdered
# set 40 random entries among linear indices 1:100 to NA
# NOTE(review): 1:100 covers only a quarter of the 400-cell matrix —
# possibly 1:400 was intended; confirm against the original lecture
data2[sample(1:100,size=40,replace=FALSE)] <- NA
# impute missing entries from the k nearest-neighbor rows (k = 10 default);
# $data extracts the imputed matrix from the returned list
data2 <- impute.knn(data2)$data
# SVD of the original vs the imputed matrix
svd1 <- svd(scale(dataOrdered)); svd2 <- svd(scale(data2))
par(mfrow=c(1,2))
# compare first right singular vectors — imputation roughly preserves the pattern
plot(svd1$v[,1],pch=19, main="Original")
plot(svd2$v[,1],pch=19, main="Imputed")
# load faceData
load("figures/face.rda")
# perform SVD on the scaled image matrix
svd3 <- svd(scale(faceData))
# proportion of total variance explained by each singular vector
plot(svd3$d^2/sum(svd3$d^2),pch=19,xlab="Singular vector",ylab="Variance explained")
# rank-1 approximation: outer product of first left/right vectors, scaled by d[1]
approx1 <- svd3$u[,1] %*% t(svd3$v[,1]) * svd3$d[1]
# rank-5 and rank-10 approximations: U D V^T restricted to the first k components
approx5 <- svd3$u[,1:5] %*% diag(svd3$d[1:5])%*% t(svd3$v[,1:5])
approx10 <- svd3$u[,1:10] %*% diag(svd3$d[1:10])%*% t(svd3$v[,1:10])
# create 1 x 4 panel plot
par(mfrow=c(1,4))
# plot the approximations in increasing rank, then the original image
image(t(approx1)[,nrow(approx1):1], main = "1 Component")
image(t(approx5)[,nrow(approx5):1], main = "5 Component")
image(t(approx10)[,nrow(approx10):1], main = "10 Component")
image(t(faceData)[,nrow(faceData):1], main = "Original")
grDevices
Packagecolors()
function = lists names of colors available in any plotting functioncolorRamp
function
gray
function)pal <- colorRamp(c("red", "blue"))
= defines a colorRamp
functionpal(0)
returns a 1 x 3 matrix containing values for RED, GREEN, and BLUE values that range from 0 to 255pal(seq(0, 1, len = 10))
returns a 10 x 3 matrix of 10 colors that range from RED to BLUE (two ends of spectrum defined in the object)# define colorRamp function
pal <- colorRamp(c("red", "blue"))
# create a color
pal(0.67)
## [,1] [,2] [,3]
## [1,] 84.15 0 170.85
colorRampPalette
function
heat.colors
or topo.colors
)pal <- colorRampPalette(c("red", "yellow"))
defines a colorRampPalette
functionpal(10)
returns 10 interpolated colors in hexadecimal format that range between the defined ends of spectrum# define colorRampPalette function
pal <- colorRampPalette(c("red", "yellow"))
# create 10 colors
pal(10)
## [1] "#FF0000" "#FF1C00" "#FF3800" "#FF5500" "#FF7100" "#FF8D00" "#FFAA00"
## [8] "#FFC600" "#FFE200" "#FFFF00"
rgb
function
red
, green
, and blue
arguments = values between 0 and 1alpha = 0.5
= transparency control, values between 0 and 1plot
/image
commandscolorspace
package can
be used for different control over colorsx <- rnorm(200); y <- rnorm(200)
par(mfrow=c(1,2))
# normal scatter plot
plot(x, y, pch = 19, main = "Default")
# using transparency shows data much better
plot(x, y, col = rgb(0, 0, 0, 0.2), main = "With Transparency")
RColorBrewer
Packagelibrary(RColorBrewer)
RColorBrewer
package can be used by colorRamp
and colorRampPalette
functionsbrewer.pal(n, "BuGn")
function
n
= number of colors to generated"BuGn"
= name of palette
?brewer.pal
list all available palettes to usen
hexadecimal colorslibrary(RColorBrewer)
# generate 3 colors using brewer.pal function
cols <- brewer.pal(3, "BuGn")
pal <- colorRampPalette(cols)
par(mfrow=c(1,3))
# heat.colors/default
image(volcano, main = "Heat.colors/Default")
# topographical colors
image(volcano, col = topo.colors(20), main = "Topographical Colors")
# RColorBrewer colors
image(volcano, col = pal(20), main = "RColorBrewer Colors")
smoothScatter
function
RColorBrewer
packagex <- rnorm(10000); y <- rnorm(10000)
smoothScatter(x, y)
Loading Training Set of Samsung S2 Data from UCI Repository
# load data frame provided
load("samsungData.rda")
# table of 6 types of activities
table(samsungData$activity)
##
## laying sitting standing walk walkdown walkup
## 1407 1286 1374 1226 986 1073
Plotting Average Acceleration for First Subject
# set up 1 x 2 panel plot
par(mfrow=c(1, 2), mar = c(5, 4, 1, 1))
# converts activity to a factor variable
samsungData <- transform(samsungData, activity = factor(activity))
# find only the subject 1 data
sub1 <- subset(samsungData, subject == 1)
# plot mean body acceleration in X direction
plot(sub1[, 1], col = sub1$activity, ylab = names(sub1)[1],
main = "Mean Body Acceleration for X")
# plot mean body acceleration in Y direction
plot(sub1[, 2], col = sub1$activity, ylab = names(sub1)[2],
main = "Mean Body Acceleration for Y")
# add legend
legend("bottomright",legend=unique(sub1$activity),col=unique(sub1$activity), pch = 1)
Clustering Based on Only Average Acceleration
# load myplclust function (defined earlier in these notes)
source("myplclust.R")
# calculate distance matrix on the first 3 features (mean body acceleration)
distanceMatrix <- dist(sub1[,1:3])
# form hclust object
hclustering <- hclust(distanceMatrix)
# run myplclust; unclass() turns the activity factor into integer color codes
myplclust(hclustering, lab.col = unclass(sub1$activity))
Plotting Max Acceleration for the First Subject
# create 1 x 2 panel
par(mfrow=c(1,2))
# plot max accelerations in x and y direction (columns 10 and 11 of sub1)
plot(sub1[,10],pch=19,col=sub1$activity,ylab=names(sub1)[10],
main = "Max Body Acceleration for X")
plot(sub1[,11],pch=19,col = sub1$activity,ylab=names(sub1)[11],
main = "Max Body Acceleration for Y")
Clustering Based on Maximum Acceleration
# calculate distance matrix for max distances
distanceMatrix <- dist(sub1[,10:12])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering,lab.col=unclass(sub1$activity))
Singular Value Decomposition
# Perform SVD on all feature columns, dropping the last two
# (562 = subject, 563 = activity); scale() standardizes each column.
# Assignment changed from `=` to the idiomatic `<-` (same behavior).
svd1 <- svd(scale(sub1[, -c(562, 563)]))
# create 1 x 2 panel plot
par(mfrow = c(1, 2))
# plot first two left singular vectors, colored by activity;
# they roughly separate moving from non-moving activities
plot(svd1$u[, 1], col = sub1$activity, pch = 19, main = "First Left Singular Vector")
plot(svd1$u[, 2], col = sub1$activity, pch = 19, main = "Second Left Singular Vector")
New Clustering with Maximum Contributers
# find the max contributing feature
maxContrib <- which.max(svd1$v[,2])
# recalculate distance matrix
distanceMatrix <- dist(sub1[, c(10:12,maxContrib)])
hclustering <- hclust(distanceMatrix)
myplclust(hclustering,lab.col=unclass(sub1$activity))
# name of max contributing factor
names(samsungData)[maxContrib]
## [1] "fBodyAcc.meanFreq...Z"
K-means Clustering (nstart=1, first try)
# specify 6 centers for data (one hoped-for cluster per activity)
kClust <- kmeans(sub1[,-c(562,563)],centers=6)
# tabulate the 6 clusters against the 6 activities; with the default
# nstart = 1, many clusters contain multiple activities
table(kClust$cluster,sub1$activity)
##
## laying sitting standing walk walkdown walkup
## 1 0 0 0 95 0 0
## 2 16 12 7 0 0 0
## 3 24 33 46 0 0 0
## 4 10 2 0 0 0 0
## 5 0 0 0 0 49 0
## 6 0 0 0 0 0 53
K-means clustering (nstart=100, first try)
# run k-means algorithm 100 times
kClust <- kmeans(sub1[,-c(562,563)],centers=6,nstart=100)
# tabulate results
table(kClust$cluster,sub1$activity)
##
## laying sitting standing walk walkdown walkup
## 1 0 37 51 0 0 0
## 2 3 0 0 0 0 53
## 3 18 10 2 0 0 0
## 4 0 0 0 0 49 0
## 5 29 0 0 0 0 0
## 6 0 0 0 95 0 0
K-means clustering (nstart=100, second try)
# run k-means algorithm 100 times
kClust <- kmeans(sub1[,-c(562,563)],centers=6,nstart=100)
# tabulate results
table(kClust$cluster,sub1$activity)
##
## laying sitting standing walk walkdown walkup
## 1 18 10 2 0 0 0
## 2 0 0 0 0 49 0
## 3 29 0 0 0 0 0
## 4 0 37 51 0 0 0
## 5 0 0 0 95 0 0
## 6 3 0 0 0 0 53
Cluster 1 Variable Centers (Laying)
# plot the first 10 feature centers of the laying cluster (cluster 1 here —
# k-means cluster numbering varies by run) to see which features drive it
plot(kClust$center[1,1:10],pch=19,ylab="Cluster Center",xlab="")
Cluster 2 Variable Centers (Walking)
# plot the first 10 feature centers of the walking cluster (cluster 4 here —
# k-means cluster numbering varies by run) to see which features drive it
plot(kClust$center[4,1:10],pch=19,ylab="Cluster Center",xlab="")
Read Raw Data from 1999 and 2012
# read in raw data from 1999: lines starting with "#" are comments,
# fields are pipe-separated, empty strings become NA
pm0 <- read.table("pm25_data/RD_501_88101_1999-0.txt", comment.char = "#", header = FALSE, sep = "|", na.strings = "")
# read in headers/column labels (first line of the file)
cnames <- readLines("pm25_data/RD_501_88101_1999-0.txt", 1)
# split the header on "|" after dropping the leading "# " (substring from char 3);
# fixed = TRUE treats "|" as a literal, not a regex alternation
cnames <- strsplit(substring(cnames, 3), "|", fixed = TRUE)
# make.names() produces syntactically valid column names
names(pm0) <- make.names(cnames[[1]])
# we are interested in the pm2.5 readings in the "Sample.Value" column
x0 <- pm0$Sample.Value
# read in the data from 2012; nrow pre-declares the row count
pm1 <- read.table("pm25_data/RD_501_88101_2012-0.txt", comment.char = "#", header = FALSE, sep = "|",
na.strings = "", nrow = 1304290)
# reuse the 1999 column names (the 2012 file has the same layout)
names(pm1) <- make.names(cnames[[1]])
# take the 2012 data for pm2.5 readings
x1 <- pm1$Sample.Value
Summaries for Both Periods
# generate 6 number summaries
summary(x1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -10.00 4.00 7.63 9.14 12.00 909.00 73133
summary(x0)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 7.20 11.50 13.74 17.90 157.10 13217
# calculate % of missing values, Are missing values important here?
data.frame(NA.1990 = mean(is.na(x0)), NA.2012 = mean(is.na(x1)))
## NA.1990 NA.2012
## 1 0.1125608 0.05607125
Make a boxplot of both 1999 and 2012
par(mfrow = c(1,2))
# regular boxplot, data too right skewed
boxplot(x0, x1, main = "Regular Boxplot")
# log boxplot, significant difference in means, but more spread
boxplot(log10(x0), log10(x1), main = "log Boxplot")
Check for Negative Values in ‘x1’
# summary again
summary(x1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -10.00 4.00 7.63 9.14 12.00 909.00 73133
# logical vector marking negative measurements (NA where x1 is NA)
negative <- x1 < 0
# count number of negatives
sum(negative, na.rm = T)
## [1] 26474
# calculate percentage of negatives
mean(negative, na.rm = T)
## [1] 0.0215034
# capture the date data (stored as integer YYYYMMDD)
dates <- pm1$Date
dates <- as.Date(as.character(dates), "%Y%m%d")
# histogram of observation counts binned by month
hist(dates, "month") ## Check what's going on in months 1--6
Check Same New York Monitors at 1999 and 2012
# find unique monitors in New York (State.Code 36) in 1999
site0 <- unique(subset(pm0, State.Code == 36, c(County.Code, Site.ID)))
# find unique monitors in New York in 2012
site1 <- unique(subset(pm1, State.Code == 36, c(County.Code, Site.ID)))
# combine county codes and site IDs of the monitors into one "county.site" key
site0 <- paste(site0[,1], site0[,2], sep = ".")
site1 <- paste(site1[,1], site1[,2], sep = ".")
# find monitors present in both years
both <- intersect(site0, site1)
# print common monitors in 1999 and 2012
print(both)
## [1] "1.5" "1.12" "5.80" "13.11" "29.5" "31.3" "63.2008"
## [8] "67.1015" "85.55" "101.3"
Find how many observations available at each monitor
# add columns for combined county/site for the original data
pm0$county.site <- with(pm0, paste(County.Code, Site.ID, sep = "."))
pm1$county.site <- with(pm1, paste(County.Code, Site.ID, sep = "."))
# find subsets where state = NY and county/site = what we found previously
cnt0 <- subset(pm0, State.Code == 36 & county.site %in% both)
cnt1 <- subset(pm1, State.Code == 36 & county.site %in% both)
# split data by the county/site values and count observations per monitor
sapply(split(cnt0, cnt0$county.site), nrow)
## 1.12 1.5 101.3 13.11 29.5 31.3 5.80 63.2008 67.1015
## 61 122 152 61 61 183 61 122 122
## 85.55
## 7
sapply(split(cnt1, cnt1$county.site), nrow)
## 1.12 1.5 101.3 13.11 29.5 31.3 5.80 63.2008 67.1015
## 31 64 31 31 33 15 31 30 31
## 85.55
## 31
Choose Monitor where County = 63 and Side ID = 2008
# filter data by state/county/siteID
pm1sub <- subset(pm1, State.Code == 36 & County.Code == 63 & Site.ID == 2008)
pm0sub <- subset(pm0, State.Code == 36 & County.Code == 63 & Site.ID == 2008)
# there are 30 observations from 2012, and 122 from 1999
dim(pm1sub)
## [1] 30 29
dim(pm0sub)
## [1] 122 29
Plot Data for 2012
# capture the dates of the subset of data
dates1 <- pm1sub$Date
# capture measurements for the subset of data
x1sub <- pm1sub$Sample.Value
# convert integer dates (YYYYMMDD) to Date objects
dates1 <- as.Date(as.character(dates1), "%Y%m%d")
# plot pm2.5 value vs time (title typo "Polution" fixed)
plot(dates1, x1sub, main = "PM2.5 Pollution Level in 2012")
Plot data for 1999
# capture the dates of the subset of data
dates0 <- pm0sub$Date
# convert integer dates (YYYYMMDD) to Date objects
dates0 <- as.Date(as.character(dates0), "%Y%m%d")
# capture measurements for the subset of data
x0sub <- pm0sub$Sample.Value
# plot pm2.5 value vs time (title typo "Polution" fixed)
plot(dates0, x0sub, main = "PM2.5 Pollution Level in 1999")
Panel Plot for Both Years
# find the common y-axis range across both years (ignoring NAs)
rng <- range(x0sub, x1sub, na.rm = T)
# create 1 x 2 panel plot
par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))
# plot time series plot for 1999 (shared ylim makes the two panels comparable)
plot(dates0, x0sub, pch = 20, ylim = rng, main="Pollution in 1999")
# plot the median
abline(h = median(x0sub, na.rm = T))
# plot time series plot for 2012
plot(dates1, x1sub, pch = 20, ylim = rng, main="Pollution in 2012")
# plot the median
abline(h = median(x1sub, na.rm = T))
Find State-wide Means and Trend
# divide data by state and find the mean pollution level for 1999
mn0 <- with(pm0, tapply(Sample.Value, State.Code, mean, na.rm = T))
# divide data by state and find the mean pollution level for 2012
mn1 <- with(pm1, tapply(Sample.Value, State.Code, mean, na.rm = T))
# convert to data frames while preserving state names
d0 <- data.frame(state = names(mn0), mean = mn0)
d1 <- data.frame(state = names(mn1), mean = mn1)
# merge the 1999 and 2012 means by state
mrg <- merge(d0, d1, by = "state")
# dimension of combined data frame
dim(mrg)
## [1] 52 3
# first few lines of data
head(mrg)
## state mean.x mean.y
## 1 1 19.956391 10.126190
## 2 10 14.492895 11.236059
## 3 11 15.786507 11.991697
## 4 12 11.137139 8.239690
## 5 13 19.943240 11.321364
## 6 15 4.861821 8.749336
# plot the 1999 state means at x = 1
with(mrg, plot(rep(1, 52), mrg[, 2], xlim = c(.8, 2.2), ylim = c(3, 20),
main = "PM2.5 Pollution Level by State for 1999 & 2012",
xlab = "", ylab = "State-wide Mean PM"))
# plot the 2012 state means at x = 2
with(mrg, points(rep(2, 52), mrg[, 3]))
# connect each state's 1999 and 2012 means with a segment
segments(rep(1, 52), mrg[, 2], rep(2, 52), mrg[, 3])
# add 1999 and 2012 labels on the x axis
axis(1, c(1, 2), c("1999", "2012"))