knitr = alternative literate programming package
“Ask yourselves, what problem have you solved, ever, that was worth solving, where you knew all of the given information in advance? Where you didn’t have a surplus of information and have to filter it out, or you had insufficient information and have to go find some?” - Dan Meyer
# If it isn't installed, install the kernlab package with install.packages()
library(kernlab)
data(spam)
# Fix the RNG seed so the random train/test split below is reproducible
set.seed(3435)
# Flip a fair coin for every row of spam: 1 = training set, 0 = test set.
# The number of draws is taken from nrow(spam) so the indicator always has
# exactly one entry per row of the data.
trainIndicator = rbinom(nrow(spam), size = 1, prob = 0.5)
table(trainIndicator)
## trainIndicator
## 0 1
## 2314 2287
# Rows flagged 1 form the training set, rows flagged 0 the test set
trainSpam = spam[trainIndicator == 1, ]
testSpam = spam[trainIndicator == 0, ]
names(), summary(), head()
# Count how many training messages fall in each level of type
table(trainSpam$type)
# Plot capitalAve against type for the training set
plot(trainSpam$capitalAve ~ trainSpam$type)
# Same comparison on the log10(x + 1) scale; the + 1 offset avoids
# log10(0) = -Inf should any capitalAve value be zero
plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type)
log transform with +1, i.e. log10(x + 1), so that zero values remain defined
# Scatterplot matrix of the first four columns on the log10(x + 1) scale
plot(log10(trainSpam[, 1:4] + 1))
# Hierarchical clustering of the variables: t() turns columns 1:57 into rows,
# so dist() measures distances between variables rather than between messages
hCluster = hclust(dist(t(trainSpam[, 1:57])))
# Same clustering after the log10(x + 1) transform.
# NOTE(review): this call uses columns 1:55 while the untransformed one above
# uses 1:57 — confirm the narrower range is intended
hClusterUpdated = hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
# Numeric copy of the outcome: factor codes shifted down by one (first level -> 0)
trainSpam$numType = as.numeric(trainSpam$type) - 1

# Cost for cv.glm: number of cases where x differs from y thresholded at 0.5
costFunction = function(x, y) {
  aboveHalf = y > 0.5
  sum(x != aboveHalf)
}

library(boot)

# One cross-validated error per candidate predictor (first 55 columns)
cvError = rep(NA, 55)
for (col in seq_along(cvError)) {
  # Single-predictor formula: numType ~ <name of column col>
  predictorName = names(trainSpam)[col]
  lmFormula = reformulate(predictorName, response = "numType")
  glmFit = glm(lmFormula, family = "binomial", data = trainSpam)
  # 2-fold cross-validation; keep the second component of delta
  cvResult = cv.glm(data = trainSpam, glmfit = glmFit, cost = costFunction, K = 2)
  cvError[col] = cvResult$delta[2]
}

# Name of the predictor with the smallest cross-validated error
names(trainSpam)[which.min(cvError)]
## [1] "charDollar"
# Use the best model from the group: logistic regression on charDollar alone
predictionModel = glm(numType ~ charDollar, family = "binomial", data = trainSpam)
# Get predictions on the test set. type = "response" returns probabilities;
# the default ("link") returns log-odds, which cannot be compared with 0.5.
predictionTest = predict(predictionModel, testSpam, type = "response")
# Start with every test message labelled 'nonspam'
predictedSpam = rep("nonspam", nrow(testSpam))
# Classify as 'spam' for those with prob > 0.5. The mask must come from the
# test-set predictions: predictionModel$fitted holds training-set values whose
# rows do not correspond to the rows of testSpam.
predictedSpam[predictionTest > 0.5] = "spam"
# Classification table
table(predictedSpam, testSpam$type)
# NOTE(review): the recorded outputs below come from a run that thresholded
# predictionModel$fitted (training-set values); re-run to refresh them.
##
## predictedSpam nonspam spam
## nonspam 1346 458
## spam 61 449
# Error rate: share of test messages whose predicted label differs from the truth
mean(predictedSpam != testSpam$type)
## [1] 0.2242869
knitr
knitr
*text*
**text**
#Heading
##Heading
###Heading
- first element
1. first element
[text](url)
[text][1]
\(\rightarrow\) later in the document, define all of the links in this format: [1]: url "text"
knitr
package
======== = indicates title of document (large text)
$expression$ = indicates LaTeX expression/formatting
`text` = changes text to code format (typewriter font)
```{r name, echo = FALSE, results = "hide"}...``` = R code chunk
name = name of the code chunk
echo = FALSE = turns off the echo of the R code chunk, which means display only the result
results = "hide" = hides the results from being placed in the markdown document
`r variable` = prints the value of that variable directly inline with the text
```{r scatterplot, fig.height = 4, fig.width = 6} ... plot() ... ``` = inserts a plot into markdown document
scatterplot = name of this code chunk (can be anything)
fig.height = 4 = adjusts height of the figure, specifying this alone will produce a rectangular plot rather than a square one by default
fig.width = 6 = adjusts width of the figure
(install.packages("xtable"))
xtable prints the table in html format, which is normally better presented than plain text
library(datasets)
library(xtable)
# Regress Ozone on Wind, Temp and Solar.R using the airquality data
fit <- lm(Ozone ~ Wind + Temp + Solar.R, data = airquality)
# Turn the model summary into an xtable object and emit it as HTML
fitSummary <- summary(fit)
xt <- xtable(fitSummary)
print(xt, type = "html")
| | Estimate | Std. Error | t value | Pr(>\|t\|) |
---|---|---|---|---|
(Intercept) | -64.3421 | 23.0547 | -2.79 | 0.0062 |
Wind | -3.3336 | 0.6544 | -5.09 | 0.0000 |
Temp | 1.6521 | 0.2535 | 6.52 | 0.0000 |
Solar.R | 0.0598 | 0.0232 | 2.58 | 0.0112 |
```{r setoptions, echo = FALSE} opts_chunk$set(echo = FALSE, results = "hide") ``` = sets the default option to not print the code/results unless otherwise specified
results = "asis" OR "hide"
"asis" = output to stay in original format and not compiled into HTML
echo = TRUE OR FALSE
fig.height = numeric
fig.width = numeric
cache = TRUE
library(knitr)
# Switch to the folder that holds document.Rmd. Replace the placeholder with a
# real path; it is quoted so that the line parses as R.
setwd("<working directory>")
# Knit the R Markdown source to HTML
knit2html("document.Rmd")
# Open document.html in the browser
browseURL("document.html")
knitr
documents
knitr processes file to Markdown (.md) \(\rightarrow\) knitr converts file to HTML
knitr
download.file("url", "filename") = convenient way to download file
sessionInfo() = prints R version, operating system, locale, base/attached/utilized packages
set.seed() can be used to specify seed for random generator in R
cacher Package
cacher package parses R source files and creates necessary cache directories/subdirectories
(cache = TRUE function for knitr)
cachepackage function creates cacher package storing
cacher package
clonecache(id = "####") = loads data from cache
showfiles() = lists R scripts available in cache
sourcefile("name.R") = loads cached R file
code() = prints the content of the R file line by line
graphcode() = plots a graph to demonstrate dependencies/structure of code
objectcode("object") = shows lines of code that were used to generate that specific object (tracing all the way back to reading data)
runcode() = executes code by loading data from cached database (much faster than regular)
checkcode() = evaluates all expressions from scratch
checkobjects() = checks the integrity of data objects (i.e. sees whether there is possible data corruption)
loadcache() = loads pointers to data objects in the database
library(cacher)
# Clone a cache, identified here by its full 40-character id
clonecache(id = "092dcc7dda4b93e42f23e038a60e1d44dbec7b3f")
# The same call with only a short prefix of that id
clonecache(id = "092d") ## effectively the same as above
# output: created cache directory '.cache'
# List the files stored in the cloned cache
showfiles() # show files stored in cache
# output: [1] "top20.R"
# Select one of the listed scripts as the file the following calls operate on
sourcefile("top20.R") # load R script
# Print the expressions of the selected script, numbered one per line
# (long expressions appear cut off in the recorded output below)
code() # examine the content of the code
# output:
# source file: top20.R
# 1 cities <- readLines("citylist.txt")
# 2 classes <- readLines("colClasses.txt")
# 3 vars <- c("date", "dow", "death",
# 4 data <- lapply(cities, function(city) {
# 5 names(data) <- cities
# 6 estimates <- sapply(data, function(city) {
# 7 effect <- weighted.mean(estimates[1,
# 8 stderr <- sqrt(1/sum(1/estimates[2,
# Draw a graph of the structure of the selected script's code
graphcode() # generate graph showing structure of code
# Show the lines of code that were used to create the object named "data".
# The argument is written with plain ASCII double quotes; typographic (curly)
# quotes are not valid string delimiters in R.
objectcode("data")
# output:
# source file: top20.R
# 1 cities <- readLines("citylist.txt")
# 2 classes <- readLines("colClasses.txt")
# 3 vars <- c("date", "dow", "death", "tmpd", "rmtmpd", "dptp", "rmdptp", "l1pm10tmean")
# 4 data <- lapply(cities, function(city) {
# filename <- file.path("data", paste(city, "csv", sep = "."))
# d0 <- read.csv(filename, colClasses = classes, nrow = 5200)
# d0[, vars]
# })
# 5 names(data) <- cities
# Make the cached objects available in the workspace
loadcache()
# List the objects that are now visible
ls()
# output:
# [1] "cities" "classes" "data" "effect"
# [5] "estimates" "stderr" "vars"
# Print an object; the recorded output shows a cache db file being transferred
# before the value is displayed
cities
# output:
# / transferring cache db file b8fd490bcf1d48cd06...
# [1] "la" "ny" "chic" "dlft" "hous" "phoe"
# [7] "staa" "sand" "miam" "det" "seat" "sanb"
# [13] "sanj" "minn" "rive" "phil" "atla" "oakl"
# [19] "denv" "clev"
# Print the cached effect value
effect
# output:
# / transferring cache db file 584115c69e5e2a4ae5...
# [1] 0.0002313219
# Print the cached stderr value
stderr
# output:
# / transferring cache db file 81b6dc23736f3d72c6...
# [1] 0.000052457