caret Package (tutorial)

# load data
library(kernlab); data(spam); set.seed(333)
# picking a small subset (10 values) from spam data set
smallSpam <- spam[sample(dim(spam)[1],size=10),]
# label spam = 2 and ham = 1
spamLabel <- (smallSpam$type=="spam")*1 + 1
# plot the capitalAve values for the dataset with colors differentiated by spam/ham (2 vs 1)
plot(smallSpam$capitalAve,col=spamLabel)
# first rule (over-fitting to capture all variation)
rule1 <- function(x){
prediction <- rep(NA,length(x))
prediction[x > 2.7] <- "spam"
prediction[x < 2.40] <- "nonspam"
prediction[(x >= 2.40 & x <= 2.45)] <- "spam"
prediction[(x > 2.45 & x <= 2.70)] <- "nonspam"
return(prediction)
}
# tabulate results of prediction algorithm 1 (in sample error -> no error in this case)
table(rule1(smallSpam$capitalAve),smallSpam$type)
##
## nonspam spam
## nonspam 5 0
## spam 0 5
# second rule (simple, setting a threshold)
rule2 <- function(x){
prediction <- rep(NA,length(x))
prediction[x > 2.8] <- "spam"
prediction[x <= 2.8] <- "nonspam"
return(prediction)
}
# tabulate results of prediction algorithm 2 (in sample error -> 10% in this case)
table(rule2(smallSpam$capitalAve),smallSpam$type)
##
## nonspam spam
## nonspam 5 1
## spam 0 4
# tabulate out of sample error for algorithm 1
table(rule1(spam$capitalAve),spam$type)
##
## nonspam spam
## nonspam 2141 588
## spam 647 1225
# tabulate out of sample error for algorithm 2
table(rule2(spam$capitalAve),spam$type)
##
## nonspam spam
## nonspam 2224 642
## spam 564 1171
# accuracy and total correct for algorithm 1 and 2
rbind("Rule 1" = c(Accuracy = mean(rule1(spam$capitalAve)==spam$type),
"Total Correct" = sum(rule1(spam$capitalAve)==spam$type)),
"Rule 2" = c(Accuracy = mean(rule2(spam$capitalAve)==spam$type),
"Total Correct" = sum(rule2(spam$capitalAve)==spam$type)))
## Accuracy Total Correct
## Rule 1 0.7315801 3366
## Rule 2 0.7378831 3395
caret Package (tutorial)

- the caret package provides a uniform framework to build and predict using different models; its key functions include preProcess(), createDataPartition(), createResample(), createTimeSlices(), train(), predict(), and confusionMatrix()
- caret allows different algorithms to be run the same way through the predict() function
- createDataPartition(y=data$var, times=1, p=0.75, list=FALSE) \(\rightarrow\) creates data partitions using the given variable
  - y=data$var = specifies what outcome/variable to split the data on
  - times=1 = specifies the number of partitions to create (number of data splits performed)
  - p=0.75 = percent of data that will be used for training the model
  - list=FALSE = returns a matrix of indices corresponding to p% of the data (training set); list=FALSE is generally what is used
  - list=TRUE = returns a list of indices corresponding to p% of the data (training set)
  - training <- data[inTrain, ] = subsets the data to the training set only
  - testing <- data[-inTrain, ] = the rest of the data can then be stored as the test set

# load packages and data
library(caret)
# create training set indexes with 75% of data
inTrain <- createDataPartition(y=spam$type,p=0.75, list=FALSE)
# subset spam data to training
training <- spam[inTrain,]
# subset spam data (the rest) to test
testing <- spam[-inTrain,]
# dimension of original and training dataset
rbind("original dataset" = dim(spam),"training set" = dim(training))
## [,1] [,2]
## original dataset 4601 58
## training set 3451 58
- createFolds(y=data$var, k=10, list=TRUE, returnTrain=TRUE) = slices the data into \(k\) folds for cross validation and returns \(k\) lists of indices
  - y=data$var = specifies what outcome/variable to split the data on
  - k=10 = specifies the number of folds to create (see K Fold Cross Validation)
  - list=TRUE = returns k lists of indices that correspond to the cross-validated sets; since it is easier to work with k lists of indices, list=TRUE is generally what is used
    - with the result stored in a list (folds in this case), you can use folds[[1]][1:10] to access different elements from that list
  - list=FALSE = returns a vector indicating which of the k folds each data point belongs to (i.e. 1 - 10 is assigned for each of the data points in this case)
  - [when list=TRUE] returnTrain=TRUE = returns the indices of the training sets
  - returnTrain=FALSE = returns the indices of the test sets (the default behavior)

# create 10 folds for cross validation and return the training set indices
folds <- createFolds(y=spam$type,k=10,list=TRUE,returnTrain=TRUE)
# structure of the training set indices
str(folds)
## List of 10
## $ Fold01: int [1:4141] 1 2 3 4 6 7 8 9 10 12 ...
## $ Fold02: int [1:4140] 1 2 3 4 5 7 8 9 10 11 ...
## $ Fold03: int [1:4141] 1 3 4 5 6 7 8 9 10 11 ...
## $ Fold04: int [1:4141] 2 3 4 5 6 7 8 9 10 11 ...
## $ Fold05: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold06: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold07: int [1:4141] 1 2 3 4 5 6 8 9 11 12 ...
## $ Fold08: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold09: int [1:4141] 1 2 4 5 6 7 10 11 12 13 ...
## $ Fold10: int [1:4141] 1 2 3 5 6 7 8 9 10 11 ...
# return the test set indices instead
# note: returnTrain = FALSE is unnecessary as it is the default behavior
folds.test <- createFolds(y=spam$type,k=10,list=TRUE,returnTrain=FALSE)
str(folds.test)
## List of 10
## $ Fold01: int [1:460] 15 16 18 40 45 62 68 81 82 102 ...
## $ Fold02: int [1:459] 1 41 55 58 67 75 117 123 151 175 ...
## $ Fold03: int [1:461] 3 14 66 69 70 80 90 112 115 135 ...
## $ Fold04: int [1:460] 5 19 25 65 71 83 85 88 91 93 ...
## $ Fold05: int [1:460] 6 10 17 21 26 56 57 104 107 116 ...
## $ Fold06: int [1:459] 7 8 13 39 52 54 76 89 99 106 ...
## $ Fold07: int [1:461] 4 23 27 29 32 33 34 38 49 51 ...
## $ Fold08: int [1:460] 2 9 30 31 36 37 43 46 47 48 ...
## $ Fold09: int [1:461] 12 20 24 44 53 59 60 64 84 98 ...
## $ Fold10: int [1:460] 11 22 28 35 42 61 72 86 92 118 ...
# return first 10 elements of the first training set
folds[[1]][1:10]
## [1] 1 2 3 4 6 7 8 9 10 12
- createResample(y=data$var, times=10, list=TRUE) = creates 10 resamplings from the given data with replacement
  - times=10 = number of samples to create
  - list=TRUE = returns a list of vectors containing the indices of each sample

# create 10 resamples
resamples <- createResample(y=spam$type,times=10,list=TRUE)
# structure of the resamples (note some samples are repeated)
str(resamples)
## List of 10
## $ Resample01: int [1:4601] 1 4 4 4 7 8 12 13 13 14 ...
## $ Resample02: int [1:4601] 3 3 5 7 10 12 12 13 13 14 ...
## $ Resample03: int [1:4601] 1 2 2 3 4 5 8 10 11 12 ...
## $ Resample04: int [1:4601] 1 3 3 4 7 8 8 9 10 14 ...
## $ Resample05: int [1:4601] 2 4 5 6 7 7 8 8 9 12 ...
## $ Resample06: int [1:4601] 3 6 6 7 8 9 12 13 13 14 ...
## $ Resample07: int [1:4601] 1 2 2 5 5 6 7 8 9 10 ...
## $ Resample08: int [1:4601] 2 2 3 4 4 7 7 8 8 9 ...
## $ Resample09: int [1:4601] 1 4 7 8 8 9 12 13 15 15 ...
## $ Resample10: int [1:4601] 1 3 4 4 7 7 9 9 10 11 ...
- createTimeSlices(y=data, initialWindow=20, horizon=10) = creates training sets with the specified window length and the corresponding test sets
  - initialWindow=20 = number of consecutive values in each time slice/training set (i.e. values 1 - 20)
  - horizon=10 = number of consecutive values in each prediction/test set (i.e. values 21 - 30)
  - fixedWindow=FALSE = training sets always start at the first observation
# create time series data
tme <- 1:1000
# create time slices
folds <- createTimeSlices(y=tme,initialWindow=20,horizon=10)
# name of lists
names(folds)
## [1] "train" "test"
# first training set
folds$train[[1]]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# first test set
folds$test[[1]]
## [1] 21 22 23 24 25 26 27 28 29 30
- train(y ~ x, data=df, method="glm") = function to apply a machine learning algorithm to construct a model from the training data

# returns the arguments of the default train function
args(train.default)
## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL,
## metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric ==
## "RMSE", FALSE, TRUE), trControl = trainControl(), tuneGrid = NULL,
## tuneLength = 3)
## NULL
- the train function has a large set of parameters; below are the default options
  - method="rf" = default algorithm is random forest; caret contains a large number of algorithms
    - names(getModelInfo()) = returns all the options for the method argument
  - preProcess=NULL = set preprocessing options (see Preprocessing)
  - weights=NULL = can be used to add weights to observations, useful for unbalanced distributions (a lot more of one type than another)
  - metric=ifelse(is.factor(y), "Accuracy", "RMSE") = default metric is Accuracy for factor variables, and RMSE, or root mean squared error, for continuous variables
  - maximize=ifelse(metric=="RMSE", FALSE, TRUE) = the algorithm should maximize Accuracy and minimize RMSE
  - trControl=trainControl() = training controls for the model, more details below
  - tuneGrid=NULL
  - tuneLength=3
# returns the default arguments for the trainControl object
args(trainControl)
## function (method = "boot", number = ifelse(grepl("cv", method),
## 10, 25), repeats = ifelse(grepl("cv", method), 1, number),
## p = 0.75, initialWindow = NULL, horizon = 1, fixedWindow = TRUE,
## verboseIter = FALSE, returnData = TRUE, returnResamp = "final",
## savePredictions = FALSE, classProbs = FALSE, summaryFunction = defaultSummary,
## selectionFunction = "best", preProcOptions = list(thresh = 0.95,
## ICAcomp = 3, k = 5), index = NULL, indexOut = NULL, timingSamps = 0,
## predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5,
## alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE,
## allowParallel = TRUE)
## NULL
- trainControl creates an object that sets many options for how the model will be applied to the training data (a short example follows the list below)
  - method="boot" = resampling method
    - "boot" = bootstrapping (drawing with replacement)
    - "boot632" = bootstrapping with adjustment
    - "cv" = cross validation
    - "repeatedcv" = repeated cross validation
    - "LOOCV" = leave one out cross validation
  - number=ifelse(grepl("cv", method), 10, 25) = number of subsamples to take
    - number=10 = default for any kind of cross validation
    - number=25 = default for bootstrapping
    - number should be increased when fine-tuning a model with a large number of parameters
  - repeats=ifelse(grepl("cv", method), 1, number) = number of times to repeat the subsampling
    - repeats=1 = default for any cross validation method
    - repeats=25 = default for bootstrapping
  - p=0.75 = default percentage of data used to create training sets
  - initialWindow=NULL, horizon=1, fixedWindow=TRUE = parameters for time series data
  - verboseIter=FALSE = print the training logs
  - returnData=TRUE, returnResamp="final", savePredictions=FALSE = save the predictions for each resample
  - classProbs=FALSE = return classification probabilities along with the predictions
  - summaryFunction=defaultSummary = default summary of the model
  - preProcOptions=list(thresh = 0.95, ICAcomp = 3, k = 5) = specifies preprocessing options for the model
  - predictionBounds=rep(FALSE, 2) = specify the range of the predicted value
    - predictionBounds=c(10, NA) would mean that any value lower than 10 is treated as 10, with no upper bound
  - seeds=NA = sets the seeds used when the train function is run, so results are reproducible
  - allowParallel=TRUE = allows parallel processing/computations
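As a short illustrative sketch (not from the original notes), a custom trainControl object can be passed to train through the trControl argument; here 10-fold cross validation replaces the default bootstrap, using the spam training set created above:

# use 10-fold cross validation instead of the default 25 bootstrap resamples
ctrl <- trainControl(method="cv", number=10)
# train the model with the custom resampling controls
modelFit <- train(type ~ ., data=training, method="glm", trControl=ctrl)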
- featurePlot(x=predictors, y=outcome, plot="pairs") = shortcut to plot the relationships between the predictors and outcomes

# load relevant libraries
library(ISLR); library(ggplot2);
# load wage data
data(Wage)
# create training and test sets
inTrain <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
# plot relationships between the predictors and outcome
featurePlot(x=training[,c("age","education","jobclass")], y = training$wage,plot="pairs")
- qplot(age, wage, color=education, data=training) = can be used to separate data by a factor variable (by coloring the points differently)
- geom_smooth(method="lm") = adds a regression line to the plots
- geom=c("boxplot", "jitter") = specifies what kind of plot to produce, in this case both the boxplot and the point cloud

# qplot plus linear regression lines
qplot(age,wage,colour=education,data=training)+geom_smooth(method='lm',formula=y~x)
- cut2(variable, g=3) = creates a new factor variable by cutting the specified variable into 3 groups (g=3) based on percentiles
  - the cut2 function is part of the Hmisc package, so library(Hmisc) must be run first
- grid.arrange(p1, p2, ncol=2) = gridExtra function to print multiple graphs on the same plot
  - the grid.arrange function is part of the gridExtra package, so library(gridExtra) must be run first

# load Hmisc and gridExtra packages
library(Hmisc);library(gridExtra);
# cut the wage variable
cutWage <- cut2(training$wage,g=3)
# plot the boxplot
p1 <- qplot(cutWage,age, data=training,fill=cutWage,
geom=c("boxplot"))
# plot boxplot and point clusters
p2 <- qplot(cutWage,age, data=training,fill=cutWage,
geom=c("boxplot","jitter"))
# plot the two graphs side by side
grid.arrange(p1,p2,ncol=2)
- table(cutVariable, data$var2) = tabulates the cut factor variable vs another variable in the dataset (i.e. builds a contingency table using cross-classifying factors)
- prop.table(table, margin=1) = converts a table to a proportion table
  - margin=1 = calculate the proportions based on the rows
  - margin=2 = calculate the proportions based on the columns

# tabulate the cutWage and jobclass variables
t <- table(cutWage,training$jobclass)
# print table
t
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 92.7) 445 256
## [ 92.7,119.1) 376 347
## [119.1,318.3] 269 409
# convert to proportion table based on the rows
prop.table(t,1)
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 92.7) 0.6348074 0.3651926
## [ 92.7,119.1) 0.5200553 0.4799447
## [119.1,318.3] 0.3967552 0.6032448
- qplot(var1, color=var2, data=training, geom="density") = produces a density plot for the given numeric and factor variables
# produce density plot
qplot(wage,colour=education,data=training,geom="density")
- note: the test set must be standardized using the mean and standard deviation of the train variables
- train(y ~ x, data=training, preProcess=c("center", "scale")) = preprocessing can be directly specified in the train function
  - preProcess=c("center", "scale") = normalizes all predictors before constructing the model
- preProcess(trainingData, method=c("center", "scale")) = function in the caret package to standardize data
  - the result of the preProcess function can be stored as an object and applied to the train and test sets using the predict function

# load spam data
data(spam)
# create train and test sets
inTrain <- createDataPartition(y=spam$type,p=0.75, list=FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
# create preProcess object for all predictors ("-58" because 58th = outcome)
preObj <- preProcess(training[,-58],method=c("center","scale"))
# normalize training set
trainCapAveS <- predict(preObj,training[,-58])$capitalAve
# normalize test set using training parameters
testCapAveS <- predict(preObj,testing[,-58])$capitalAve
# compare results for capitalAve variable
rbind(train = c(mean = mean(trainCapAveS), std = sd(trainCapAveS)),
test = c(mean(testCapAveS), sd(testCapAveS)))
## mean std
## train 6.097035e-18 1.000000
## test 7.548133e-02 1.633866
- preProcess(data, method="BoxCox") = applies Box-Cox transformations to continuous data to help normalize the variables through maximum likelihood
- qqnorm(processedVar) = produces a Q-Q plot, which compares the theoretical quantiles with the sample quantiles to assess the normality of the data

# set up BoxCox transforms
preObj <- preProcess(training[,-58],method=c("BoxCox"))
# perform preprocessing on training data
trainCapAveS <- predict(preObj,training[,-58])$capitalAve
# plot histogram and QQ Plot
# Note: the transformation definitely helped to
# normalize the data but it does not produce perfect result
par(mfrow=c(1,2)); hist(trainCapAveS); qqnorm(trainCapAveS)
preProcess(data, method="knnImpute")
= impute/estimate the missing data using k nearest neighbors (knn) imputation
knnImpute
= takes the k nearest neighbors from the missing value and averages the value to impute the missing observations# Make some values NA
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1],size=1,prob=0.05)==1
training$capAve[selectNA] <- NA
# Impute and standardize
preObj <- preProcess(training[,-58],method="knnImpute")
capAve <- predict(preObj,training[,-58])$capAve
# Standardize true values
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth-mean(capAveTruth))/sd(capAveTruth)
# compute differences between imputed values and true values
quantile(capAve - capAveTruth)
## 0% 25% 50% 75% 100%
## -1.656344e+00 2.377772e-05 1.286900e-03 1.881653e-03 3.174413e-01
- preProcess() can be leveraged to handle creating new covariates
- dummyVars(outcome ~ var, data=training) = creates a dummy variable object that can be used through the predict function to create dummy variables
  - predict(dummyObj, newdata=training) = creates the appropriate columns to represent the factor variable with the appropriate 0s and 1s
# setting up data
inTrain <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
# create a dummy variable object
dummies <- dummyVars(wage ~ jobclass,data=training)
# create the dummy variable columns
head(predict(dummies,newdata=training))
## jobclass.1. Industrial jobclass.2. Information
## 231655 1 0
## 86582 0 1
## 161300 1 0
## 155159 0 1
## 11443 0 1
## 376662 0 1
- nearZeroVar(training, saveMetrics=TRUE) = returns a table of the variables in the training data set, with information on frequency ratios, percent unique values, and whether or not each variable has zero or near zero variance
  - freqRatio = ratio of the frequency of the most common value to the frequency of the second most common value
  - percentUnique = percentage of unique data points out of the total number of data points
  - zeroVar = TRUE/FALSE indicating whether the predictor has only one distinct value
  - nzv = TRUE/FALSE indicating whether the predictor is a near zero variance predictor
  - when nzv = TRUE, those variables should be thrown out

# print nearZeroVar table
nearZeroVar(training,saveMetrics=TRUE)
## freqRatio percentUnique zeroVar nzv
## year 1.017647 0.33301618 FALSE FALSE
## age 1.231884 2.85442436 FALSE FALSE
## sex 0.000000 0.04757374 TRUE TRUE
## maritl 3.329571 0.23786870 FALSE FALSE
## race 8.480583 0.19029496 FALSE FALSE
## education 1.393750 0.23786870 FALSE FALSE
## region 0.000000 0.04757374 TRUE TRUE
## jobclass 1.070936 0.09514748 FALSE FALSE
## health 2.526846 0.09514748 FALSE FALSE
## health_ins 2.209160 0.09514748 FALSE FALSE
## logwage 1.011765 18.83920076 FALSE FALSE
## wage 1.011765 18.83920076 FALSE FALSE
- [splines package] bs(data$var, df=3) = creates 3 new columns corresponding to the var, var^2, and var^3 terms
- ns() and poly() can also be used to generate polynomials
- the gam() function can also be used; it allows for smoothing of multiple variables, with different smoothing values for each variable (see the sketch after the example below)
- note: the same polynomial terms must be generated for the test set using the predict function

# load splines package
library(splines)
# create polynomial function
bsBasis <- bs(training$age,df=3)
# fit the outcome on the three polynomial terms
lm1 <- lm(wage ~ bsBasis,data=training)
# plot all age vs wage data
plot(training$age,training$wage,pch=19,cex=0.5)
# plot the fitted polynomial function
points(training$age,predict(lm1,newdata=training),col="red",pch=19,cex=0.5)
# predict on test values
head(predict(bsBasis,age=testing$age))
## 1 2 3
## [1,] 0.0000000 0.00000000 0.000000000
## [2,] 0.2368501 0.02537679 0.000906314
## [3,] 0.4163380 0.32117502 0.082587862
## [4,] 0.4308138 0.29109043 0.065560908
## [5,] 0.3625256 0.38669397 0.137491189
## [6,] 0.3063341 0.42415495 0.195763821
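As a brief sketch of the gam() idea (not part of the original notes; it assumes the mgcv package, since the notes do not name one):

# fit a generalized additive model that smooths age while keeping education as a factor
library(mgcv)
gamFit <- gam(wage ~ s(age) + education, data=training)
# plot the fitted smooth term for age
plot(gamFit)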
- many of the algorithms in the caret package are computationally intensive
- the doMC package is recommended for caret computations (reference)
  - doMC::registerDoMC(cores=4) = registers 4 cores for R to utilize
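A minimal sketch of the workflow (an illustration, not from the original notes; assumes the doMC package is installed on a Unix-like OS):

# register 4 cores, then call train as usual -- caret parallelizes the
# resampling automatically while allowParallel=TRUE (the default)
library(doMC)
registerDoMC(cores=4)
modelFit <- train(type ~ ., data=training, method="glm")  # e.g. the spam training set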
prcomp Function

- pr <- prcomp(data) = performs PCA on all variables and returns a prcomp object that contains information about standard deviations and rotations
  - pr$rotation = returns the eigenvectors for the linear combinations of all variables (the coefficients that the variables are multiplied by to come up with the principal components) \(\rightarrow\) how the principal components are created
- since some variables are heavily skewed, it helps to take a log transformation of the variables (adding 1 to avoid taking the log of 0) before performing PCA
- plot(pr) = plots the percent variation explained by the first 10 principal components (PC)
# load spam data
data(spam)
# perform PCA on dataset
prComp <- prcomp(log10(spam[,-58]+1))
# print the first 5 rows of the rotation matrix (eigenvectors) for the first 5 PCs
head(prComp$rotation[, 1:5], 5)
## PC1 PC2 PC3 PC4 PC5
## make 0.019370409 0.0427855959 -0.01631961 0.02798232 -0.014903314
## address 0.010827343 0.0408943785 0.07074906 -0.01407049 0.037237531
## all 0.040923168 0.0825569578 -0.03603222 0.04563653 0.001222215
## num3d 0.006486834 -0.0001333549 0.01234374 -0.01005991 -0.001282330
## our 0.036963221 0.0941456085 -0.01871090 0.05098463 -0.010582039
# create new variable that marks spam as 2 and nonspam as 1
typeColor <- ((spam$type=="spam")*1 + 1)
# plot the first two principal components
plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab="PC1",ylab="PC2")
caret Package

- pp <- preProcess(log10(training[,-58]+1), method="pca", pcaComp=2, thresh=0.8) = performs PCA with the preProcess function and returns the number of principal components that can capture the majority of the variation
  - creates a preProcess object that can be applied using the predict function
  - pcaComp=2 = specifies the number of principal components to compute (2 in this case)
  - thresh=0.8 = threshold for the variation captured by the principal components
    - thresh=0.95 = default value, which returns the number of principal components needed to capture 95% of the variation in the data
- predict(pp, training) = computes the new variables for the PCs (2 in this case) on the training data set
  - the results from predict can then be used as data for the prediction model

# create train and test sets
inTrain <- createDataPartition(y=spam$type,p=0.75, list=FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
# create preprocess object
preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2)
# calculate PCs for training data
trainPC <- predict(preProc,log10(training[,-58]+1))
# run model on outcome and principal components
modelFit <- train(training$type ~ .,method="glm",data=trainPC)
# calculate PCs for test data
testPC <- predict(preProc,log10(testing[,-58]+1))
# compare results
confusionMatrix(testing$type,predict(modelFit,testPC))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 656 41
## spam 82 371
##
## Accuracy : 0.893
## 95% CI : (0.8737, 0.9103)
## No Information Rate : 0.6417
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7724
## Mcnemar's Test P-Value : 0.0003101
##
## Sensitivity : 0.8889
## Specificity : 0.9005
## Pos Pred Value : 0.9412
## Neg Pred Value : 0.8190
## Prevalence : 0.6417
## Detection Rate : 0.5704
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.8947
##
## 'Positive' Class : nonspam
##
- alternatively, PCA can be performed directly within the train method:
  - train(outcome ~ ., method="glm", preProcess="pca", data=training) = performs PCA first on the training set and then runs the specified model
    - this consolidates the two steps from before (preProcess \(\rightarrow\) predict)

# construct model
modelFit <- train(training$type ~ .,method="glm",preProcess="pca",data=training)
# print results of model
confusionMatrix(testing$type,predict(modelFit,testing))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 668 29
## spam 59 394
##
## Accuracy : 0.9235
## 95% CI : (0.9066, 0.9382)
## No Information Rate : 0.6322
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8379
## Mcnemar's Test P-Value : 0.001992
##
## Sensitivity : 0.9188
## Specificity : 0.9314
## Pos Pred Value : 0.9584
## Neg Pred Value : 0.8698
## Prevalence : 0.6322
## Detection Rate : 0.5809
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.9251
##
## 'Positive' Class : nonspam
##
- lm <- lm(y ~ x, data=train) = runs a linear model of outcome y on predictor x \(\rightarrow\) univariate regression
- summary(lm) = returns a summary of the linear regression model, which includes coefficients, standard errors, \(t\) statistics, and p values
- lm(y ~ x1 + x2 + x3, data=train) = runs a linear model of outcome y on predictors x1, x2, and x3
- lm(y ~ ., data=train) = runs a linear model of outcome y on all predictors
- predict(lm, newdata=df) = uses the constructed linear model to predict outcomes (\(\hat Y_i\)) for the new values
  - the newdata data frame must have the same variables (factors must have the same levels) as the training data
  - newdata=test = predict outcomes for the test set based on the linear regression model built from the training set

# load data
data(faithful)
# create train and test sets
inTrain <- createDataPartition(y=faithful$waiting, p=0.5, list=FALSE)
trainFaith <- faithful[inTrain,]; testFaith <- faithful[-inTrain,]
# build linear model
lm1 <- lm(eruptions ~ waiting,data=trainFaith)
# print summary of linear model
summary(lm1)
##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24867 -0.36292 0.00002 0.35768 1.19858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.165648 0.227486 -9.52 <2e-16 ***
## waiting 0.079396 0.003146 25.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5013 on 135 degrees of freedom
## Multiple R-squared: 0.8251, Adjusted R-squared: 0.8238
## F-statistic: 636.9 on 1 and 135 DF, p-value: < 2.2e-16
# predict eruptions for new waiting time
newdata <- data.frame(waiting=80)
predict(lm1,newdata)
## 1
## 4.186
# create 1 x 2 panel plot
par(mfrow=c(1,2))
# plot train data with the regression line
plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",
ylab="Duration", main = "Train")
lines(trainFaith$waiting,predict(lm1),lwd=3)
# plot test data with the regression line
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue",xlab="Waiting",
ylab="Duration", main = "Test")
lines(testFaith$waiting,predict(lm1,newdata=testFaith),lwd=3)
# Calculate RMSE on training and test sets (note: computed here as the root of the summed, not averaged, squared errors)
c(trainRMSE = sqrt(sum((lm1$fitted-trainFaith$eruptions)^2)),
testRMSE = sqrt(sum((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2)))
## trainRMSE testRMSE
## 5.824859 5.788547
- pi <- predict(lm, newdata=test, interval="prediction") = returns 3 columns: fit (predicted value, same as before), lwr (lower bound of the prediction interval), and upr (upper bound of the prediction interval)
- matlines(x, pi, type="l") = plots three lines, one for the linear fit and two for the upper/lower prediction interval bounds

# calculate prediction interval
pred1 <- predict(lm1,newdata=testFaith,interval="prediction")
# plot data points (eruptions, waiting)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue")
# plot fit line and prediction interval
matlines(testFaith$waiting,pred1,type="l",col=c(1,2,2),lty=c(1,1,1),lwd=3)
- lm <- train(y ~ x, method="lm", data=train) = runs a linear model on the training data \(\rightarrow\) identical to the lm function
  - summary(lm$finalModel) = returns a summary of the linear regression model, including coefficients, standard errors, \(t\) statistics, and p values \(\rightarrow\) identical to summary(lm) for an lm object
- train(y ~ ., method="lm", data=train) = runs a linear model on all predictors in the training data
- plot(lm$finalModel) = constructs 4 diagnostic plots for evaluating the model
  - more information on the plots can be found at ?plot.lm
# create train and test sets
inTrain <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
# fit linear model for age jobclass and education
modFit<- train(wage ~ age + jobclass + education,method = "lm",data=training)
# store final model
finMod <- modFit$finalModel
# set up 2 x 2 panel plot
par(mfrow = c(2, 2))
# construct diagnostic plots for model
plot(finMod,pch=19,cex=0.5,col="#00000010")
# plot fitted values by residuals
qplot(finMod$fitted, finMod$residuals, color=race, data=training)
- plot(finMod$residuals) = plots the residuals against index (row number)

# plot residual by index
plot(finMod$residuals,pch=19,cex=0.5)
- the party, rpart, and tree packages can all build trees
- within each leaf \(m\), the estimated probability of class \(k\) is \[\hat{p}_{mk} = \frac{1}{N_m}\sum_{i \in R_m} \mathbb{1}(y_i = k)\] where \(N_m\) is the size of the group
- example
# set margin and seed
par(mar=c(1,1,1,1), mfrow = c(1, 2)); set.seed(1234);
# simulate data
x = rep(1:4,each=4); y = rep(1:4,4)
# plot first scenario
plot(x,y,xaxt="n",yaxt="n",cex=3,col=c(rep("blue",15),rep("red",1)),pch=19)
# plot second scenario
plot(x,y,xaxt="n",yaxt="n",cex=3,col=c(rep("blue",8),rep("red",8)),pch=19)
caret Package

- tree <- train(y ~ ., data=train, method="rpart") = constructs trees based on the outcome and predictors
  - returns an rpart object, which can be used to predict new/test values
- print(tree$finalModel) = returns a text summary of all nodes/splits in the constructed tree
- plot(tree$finalModel, uniform=TRUE) = plots the classification tree with all nodes/splits
- [rattle package] fancyRpartPlot(tree$finalModel) = produces more readable, better formatted classification tree diagrams

# load iris data set
data(iris)
# create test/train data sets
inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
# fit classification tree as a model
modFit <- train(Species ~ .,method="rpart",data=training)
# print the classification tree
print(modFit$finalModel)
## n= 105
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 35 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.65 34 1 versicolor (0.00000000 0.97058824 0.02941176) *
## 7) Petal.Width>=1.65 36 2 virginica (0.00000000 0.05555556 0.94444444) *
# plot the classification tree
rattle::fancyRpartPlot(modFit$finalModel)
# predict on test values
predict(modFit,newdata=testing)
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica virginica virginica virginica virginica virginica
## [37] versicolor virginica virginica versicolor versicolor virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
- bagging is most useful for non-linear models
- loess(y ~ x, data=train, span=0.2) = fits a smooth curve to the data
  - span=0.2 = controls how smooth the curve should be

# load data
library(ElemStatLearn); data(ozone,package="ElemStatLearn")
# reorder rows based on ozone variable
ozone <- ozone[order(ozone$ozone),]
# create empty matrix
ll <- matrix(NA,nrow=10,ncol=155)
# iterate 10 times
for(i in 1:10){
# create sample from data with replacement
ss <- sample(1:dim(ozone)[1],replace=T)
# draw sample from the data and reorder rows based on ozone
ozone0 <- ozone[ss,]; ozone0 <- ozone0[order(ozone0$ozone),]
# fit loess function through data (similar to spline)
loess0 <- loess(temperature ~ ozone,data=ozone0,span=0.2)
# prediction from loess curve for the same values each time
ll[i,] <- predict(loess0,newdata=data.frame(ozone=1:155))
}
# plot the data points
plot(ozone$ozone,ozone$temperature,pch=19,cex=0.5)
# plot each prediction model
for(i in 1:10){lines(1:155,ll[i,],col="grey",lwd=2)}
# plot the average in red
lines(1:155,apply(ll,2,mean),col="red",lwd=2)
- in the caret package, there are three options for the train function to perform bagging
  - bagEarth - Bagged MARS (documentation)
  - treebag - Bagged CART (documentation)
  - bagFDA - Bagged Flexible Discriminant Analysis (documentation)
- alternatively, custom bag functions can be constructed (documentation)
  - bag(predictors, outcome, B=10, bagControl(fit, predict, aggregate)) = defines and executes a custom bagging algorithm
    - B=10 = number of iterations/resamplings to perform
    - bagControl() = controls for how the bagging should be executed
      - fit=ctreeBag$fit = the model to fit on each resampling of the data
      - predict=ctreeBag$pred = how predictions should be calculated from each model
      - aggregate=ctreeBag$aggregate = how the prediction models should be combined/averaged

# load relevant package and data
library(party); data(ozone,package="ElemStatLearn")
# reorder rows based on ozone variable
ozone <- ozone[order(ozone$ozone),]
# extract predictors
predictors <- data.frame(ozone=ozone$ozone)
# extract outcome
temperature <- ozone$temperature
# run bagging algorithm
treebag <- bag(predictors, temperature, B = 10,
# custom bagging function
bagControl = bagControl(fit = ctreeBag$fit,
predict = ctreeBag$pred,
aggregate = ctreeBag$aggregate))
# plot data points
plot(ozone$ozone,temperature,col='lightgrey',pch=19)
# plot the first fit
points(ozone$ozone,predict(treebag$fits[[1]]$fit,predictors),pch=19,col="red")
# plot the aggregated predictions
points(ozone$ozone,predict(treebag,predictors),pch=19,col="blue")
- rf <- train(outcome ~ ., data=train, method="rf", prox=TRUE, ntree=500) = runs the random forest algorithm on the training data against all predictors
  - prox=TRUE = the proximity measures between observations should be calculated (used in functions such as classCenter() to find the centers of groups)
    - rf$finalModel$prox = returns the matrix of proximities
  - ntree=500 = specifies the number of trees that should be constructed
  - do.trace=TRUE = prints logs as the trees are being built \(\rightarrow\) useful for indicating progress to the user
  - note: the randomForest() function can be used to run the random forest algorithm directly and is much faster (see the sketch at the end of this section)
- getTree(rf$finalModel, k=2) = returns a specific tree from the random forest model
- classCenter(predictors, outcome, proximity, nNbr) = computes the cluster centers using the nNbr nearest neighbors of the observations
  - proximity = rf$finalModel$prox = proximity matrix from the random forest model
  - nNbr = number of nearest neighbors that should be used to compute the cluster centers
- predict(rf, test) = applies the random forest model to the test data set
  - confusionMatrix(predictions, actualOutcome) = tabulates the predictions of the model against the truths
# load data
data(iris)
# create train/test data sets
inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
# apply random forest
modFit <- train(Species~ .,data=training,method="rf",prox=TRUE)
# return the second tree (first 6 rows)
head(getTree(modFit$finalModel,k=2))
## left daughter right daughter split var split point status prediction
## 1 2 3 4 0.70 1 0
## 2 0 0 0 0.00 -1 1
## 3 4 5 4 1.75 1 0
## 4 6 7 3 5.30 1 0
## 5 0 0 0 0.00 -1 3
## 6 8 9 3 4.95 1 0
# compute cluster centers
irisP <- classCenter(training[,c(3,4)], training$Species, modFit$finalModel$prox)
# convert irisP to data frame and add Species column
irisP <- as.data.frame(irisP); irisP$Species <- rownames(irisP)
# plot data points
p <- qplot(Petal.Width, Petal.Length, col=Species,data=training)
# add the cluster centers
p + geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)
# predict outcome for test data set using the random forest model
pred <- predict(modFit,testing)
# logic value for whether or not the rf algorithm predicted correctly
testing$predRight <- pred==testing$Species
# tabulate results
table(pred,testing$Species)
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 15 2
## virginica 0 0 13
# plot data points with the incorrect classification highlighted
qplot(Petal.Width,Petal.Length,colour=predRight,data=testing,main="newdata Predictions")
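A minimal sketch (not in the original notes) of calling randomForest() directly rather than through train, using the iris training set from above:

# fit the random forest directly; proximity=TRUE stores the proximity matrix
library(randomForest)
set.seed(333)
rfDirect <- randomForest(Species ~ ., data=training, ntree=500, proximity=TRUE)
# printing the fit shows the OOB error estimate and confusion matrix
rfDirect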
- note: a more detailed tutorial can be found here
- example: gbm <- train(outcome ~ variables, method="gbm", data=train, verbose=FALSE) = runs a boosting model on the given data
- options for the method argument for boosting:
  - gbm - boosting with trees
  - mboost - model based boosting
  - ada - statistical boosting based on additive logistic regression
  - gamBoost - boosting for generalized additive models
- the predict function can be used to apply the model to test data, similar to the rest of the algorithms in the caret package
- example
# load data
data(Wage)
# remove log wage variable (we are trying to predict wage)
Wage <- subset(Wage,select=-c(logwage))
# create train/test data sets
inTrain <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
# run the gbm model
modFit <- train(wage ~ ., method="gbm",data=training,verbose=FALSE)
# print model summary
print(modFit)
## Stochastic Gradient Boosting
##
## 2102 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
##
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared RMSE SD Rsquared SD
## 1 50 35.64972 0.3317422 1.495165 0.02312183
## 1 100 34.95593 0.3429594 1.503223 0.02319651
## 1 150 34.84473 0.3451634 1.496016 0.02324176
## 2 50 34.91119 0.3462101 1.490004 0.02425481
## 2 100 34.74433 0.3487227 1.480423 0.02278780
## 2 150 34.74823 0.3487136 1.472941 0.02314265
## 3 50 34.83480 0.3467828 1.493846 0.02292988
## 3 100 34.85342 0.3449018 1.482881 0.02373242
## 3 150 34.99413 0.3401694 1.544378 0.02498133
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 100,
## interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10.
- the predicted value for the outcome is therefore \(\hat{Y}(x) = \arg\max_k \delta_k(x)\)
- [caret package] lda <- train(outcome ~ predictors, data=training, method="lda") = constructs a linear discriminant analysis model on the predictors with the provided training data
- predict(lda, test) = applies the LDA model to the test data and returns the prediction results in a data frame

# load data
package# load data
data(iris)
# create training and test sets
inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
# run the linear discriminant analysis on training data
lda <- train(Species ~ .,data=training,method="lda")
# predict test outcomes using LDA model
pred.lda <- predict(lda,testing)
# print results
pred.lda
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica virginica virginica virginica virginica virginica
## [37] virginica virginica virginica virginica virginica virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
- [caret package] nb <- train(outcome ~ predictors, data=training, method="nb") = constructs a naive Bayes model on the predictors with the provided training data
- predict(nb, test) = applies the naive Bayes model to the test data and returns the prediction results in a data frame

# using the same data from iris, run naive Bayes on training data
nb <- train(Species ~ ., data=training,method="nb")
# predict test outcomes using naive Bayes model
pred.nb <- predict(nb,testing)
# print results
pred.nb
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica virginica virginica virginica virginica virginica
## [37] virginica virginica virginica virginica virginica virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
- since both LDA and naive Bayes were applied to the same iris data set, we can compare the prediction results from the two models

# tabulate the prediction results from LDA and naive Bayes
table(pred.lda,pred.nb)
## pred.nb
## pred.lda setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 15 0
## virginica 0 0 15
# create logical variable that returns TRUE for when predictions from the two models match
equalPredictions <- (pred.lda==pred.nb)
# plot the comparison
qplot(Petal.Width,Sepal.Width,colour=equalPredictions,data=testing)
- the prostate dataset from Elements of Statistical Learning is used

# load data and set seed
data(prostate); set.seed(1)
# define outcome y and predictors x
covnames <- names(prostate[-(9:10)])
y <- prostate$lpsa; x <- prostate[,covnames]
# create test set predictors and outcomes
train.ind <- sample(nrow(prostate), ceiling(nrow(prostate)/2))
y.test <- prostate$lpsa[-train.ind]; x.test <- x[-train.ind,]
# create training set predictors and outcomes
y <- prostate$lpsa[train.ind]; x <- x[train.ind,]
# p = number of predictors
p <- length(covnames)
# initialize the list of residual sum squares
rss <- list()
# loop through each combination of predictors and build models
for (i in 1:p) {
# compute the matrix of all combinations of i predictors, for i = 1...p (an i x choose(p,i) matrix)
Index <- combn(p,i)
# calculate residual sum squares of each combination of predictors
rss[[i]] <- apply(Index, 2, function(is) {
# take each combination (or column of Index matrix) and create formula for regression
form <- as.formula(paste("y~", paste(covnames[is], collapse="+"), sep=""))
# run linear regression with combination of predictors on training data
isfit <- lm(form, data=x)
# predict outcome for all training data points
yhat <- predict(isfit)
# calculate residual sum squares for predictions on training data
train.rss <- sum((y - yhat)^2)
# predict outcome for all test data points
yhat <- predict(isfit, newdata=x.test)
# calculate residual sum squares for predictions on test data
test.rss <- sum((y.test - yhat)^2)
# store each pair of training and test residual sum squares as a list
c(train.rss, test.rss)
})
}
# set up plot with labels, title, and proper x and y limits
plot(1:p, 1:p, type="n", ylim=range(unlist(rss)), xlim=c(0,p),
xlab="Number of Predictors", ylab="Residual Sum of Squares",
main="Prostate Cancer Data - Training vs Test RSS")
# add data points for training and test residual sum squares
for (i in 1:p) {
# plot training residual sum squares in blue
points(rep(i, ncol(rss[[i]])), rss[[i]][1, ], col="blue", cex = 0.5)
# plot test residual sum squares in red
points(rep(i, ncol(rss[[i]])), rss[[i]][2, ], col="red", cex = 0.5)
}
# find the minimum training RSS for each combination of predictors
minrss <- sapply(rss, function(x) min(x[1,]))
# plot line through the minimum training RSS data points in blue
lines((1:p), minrss, col="blue", lwd=1.7)
# find the minimum test RSS for each combination of predictors
minrss <- sapply(rss, function(x) min(x[2,]))
# plot line through the minimum test RSS data points in blue
lines((1:p), minrss, col="red", lwd=1.7)
# add legend
legend("topright", c("Train", "Test"), col=c("blue", "red"), pch=1)
- note: when there are more predictors than observations, some coefficients cannot be estimated and are returned as NA (demonstrated below)
# load prostate data
data(prostate)
# create subset of observations with 10 variables
small = prostate[1:5,]
# print linear regression
lm(lpsa ~ .,data =small)
##
## Call:
## lm(formula = lpsa ~ ., data = small)
##
## Coefficients:
## (Intercept) lcavol lweight age lbph
## 9.60615 0.13901 -0.79142 0.09516 NA
## svi lcp gleason pgg45 trainTRUE
## NA NA -2.08710 NA NA
- [MASS package] ridge <- lm.ridge(outcome ~ predictors, data=training, lambda=5) = performs ridge regression on the given outcome and predictors using the provided \(\lambda\) value
  - lambda=5 = tuning parameter
  - ridge$xm = returns the column/predictor means from the data
  - ridge$scales = returns the scaling performed on the predictors for the ridge regression
  - ridge$coef = returns the conditional coefficients, \(\beta_j\), from the ridge regression
  - ridge$ym = returns the mean of the outcome
- [caret package] train(outcome ~ predictors, data=training, method="ridge", lambda=5) = performs ridge regression on the given outcome and predictors
  - preProcess=c("center", "scale") = centers and scales the predictors before the model is built
  - lambda=5 = tuning parameter
- [caret package] train(outcome ~ predictors, data=training, method="foba", lambda=5, k=4) = performs ridge regression with variable selection
  - lambda=5 = tuning parameter
  - k=4 = number of variables that should be retained
    - this means length(predictors)-k variables will be eliminated
- [caret package] predict(model, test) = use the model to predict on the test set \(\rightarrow\) similar to all other caret algorithms
- example: using the prostate dataset, we can run ridge regressions with different values of \(\lambda\) and find the optimal \(\lambda\) value that minimizes test RSS (see the sketch below)
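A minimal sketch of that lambda search (reconstructed, since the original code did not survive extraction; it assumes MASS::lm.ridge and the x, y, x.test, y.test objects defined in the prostate setup above):

# fit a ridge regression for each lambda in a grid and record the test RSS
library(MASS)
lambdas <- seq(0, 50, len=10)
test.rss <- sapply(lambdas, function(l) {
  fit <- lm.ridge(y ~ ., data=x, lambda=l)
  # predictions must be rebuilt by hand: scale the test predictors with the
  # training means/scales, apply the conditional coefficients, add back ym
  yhat <- scale(x.test, center=fit$xm, scale=fit$scales) %*% drop(fit$coef) + fit$ym
  sum((y.test - yhat)^2)
})
# lambda value that minimizes the test RSS
lambdas[which.min(test.rss)]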
- [lars package] lasso <- lars(as.matrix(x), y, type="lasso", trace=TRUE) = performs lasso regression by adding predictors one at a time (or setting some variables to 0)
  - as.matrix(x) = the predictors must be in matrix/data frame format
  - trace=TRUE = prints the progress of the lasso regression
  - lasso$lambda = returns the \(\lambda\)s used for each step of the lasso regression
  - plot(lasso) = prints a plot that shows the progression of the coefficients as they are set to zero one by one
  - predict.lars(lasso, test) = uses the lasso model to predict on the test data
    - more information/documentation can be found at ?predict.lars
- [lars package] cv.lars(as.matrix(x), y, K=10, type="lasso", trace=TRUE) = computes the K-fold cross-validated mean squared prediction error for the lasso regression
  - effectively, the lars function is run K times, with each of the folds used to estimate the prediction error
  - K=10 = create 10-fold cross validation
  - trace=TRUE = prints the progress of the lasso regression
- [enet package] lasso <- enet(predictors, outcome, lambda=0) = performs elastic net regression on the given predictors and outcome
  - lambda=0 = default value for \(\lambda\)
    - lambda=0 tells the function to fit a lasso regression
  - plot(lasso) = prints a plot that shows the progression of the coefficients as they are set to zero one by one
  - predict.enet(lasso, test) = uses the lasso model to predict on the test data (see the sketch after the example below)
- [caret package] train(outcome ~ predictors, data=training, method="lasso") = performs lasso regression on the given outcome and predictors
  - preProcess=c("center", "scale") = centers and scales the predictors before the model is built
- [caret package] train(outcome ~ predictors, data=train, method="relaxo", lambda=5, phi=0.3) = performs relaxed lasso regression on the given predictors and outcome
  - lambda=5 = tuning parameter
  - phi=0.3 = relaxation parameter
    - phi=1 corresponds to the regular lasso solutions
    - phi=0 computes the OLS estimates on the set of variables selected by the lasso
- [caret package] predict(model, test) = use the model to predict on the test set \(\rightarrow\) similar to all other caret algorithms
- example: lasso regression with the lars package

# load lars package
library(lars)
# perform lasso regression
lasso.fit <- lars(as.matrix(x), y, type="lasso", trace=TRUE)
# plot lasso regression model
plot(lasso.fit, breaks=FALSE, cex = 0.75)
# add legend
legend("topleft", covnames, pch=8, lty=1:length(covnames),
col=1:length(covnames), cex = 0.6)
# plots the cross validation curve
lasso.cv <- cv.lars(as.matrix(x), y, K=10, type="lasso", trace=TRUE)
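A minimal sketch of the enet route (not in the original notes; note that the enet() function actually ships in the elasticnet package):

# fit the lasso path with the elastic net machinery (lambda=0 means pure lasso)
library(elasticnet)
enetFit <- enet(as.matrix(x), y, lambda=0)
# predict at the halfway point of the path (s = fraction of the full L1 norm)
enetPred <- predict(enetFit, newx=as.matrix(x.test), s=0.5, mode="fraction")
head(enetPred$fit)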
- example: suppose we have 5 completely independent classifiers, each with 70% accuracy; a majority vote of the 5 then has accuracy \[\begin{aligned} \mbox{majority vote accuracy} & = p(3~correct,~2~wrong) + p(4~correct,~1~wrong) \\ &\qquad+ p(5~correct) \\ & = {5 \choose 3} \times(0.7)^3(0.3)^2 + {5 \choose 4} \times(0.7)^4(0.3)^1 + {5 \choose 5} \times (0.7)^5 \\ & = 10 \times(0.7)^3(0.3)^2 + 5 \times(0.7)^4(0.3)^1 + 1 \times (0.7)^5 \\ & = 83.7\% \\ \end{aligned}\]
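As a quick check (not in the original notes), the same number is the binomial probability that at least 3 of 5 independent 70%-accurate classifiers are correct:

# P(at least 3 of 5 classifiers correct), each correct with probability 0.7
sum(dbinom(3:5, size=5, prob=0.7))
## [1] 0.83692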
# set up data
inBuild <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
validation <- Wage[-inBuild,]; buildData <- Wage[inBuild,]
inTrain <- createDataPartition(y=buildData$wage,p=0.7, list=FALSE)
training <- buildData[inTrain,]; testing <- buildData[-inTrain,]
# train the data using both glm and random forest models
glm.fit <- train(wage ~.,method="glm",data=training)
rf.fit <- train(wage ~.,method="rf",data=training,
                trControl = trainControl(method="cv",number=3))
# use the models to predict the results on the testing set
glm.pred.test <- predict(glm.fit,testing)
rf.pred.test <- predict(rf.fit,testing)
# combine the prediction results and the true results into new data frame
combinedTestData <- data.frame(glm.pred=glm.pred.test,
rf.pred = rf.pred.test,wage=testing$wage)
# run a Generalized Additive Model (gam) model on the combined test data
comb.fit <- train(wage ~.,method="gam",data=combinedTestData)
# use the resultant model to predict on the test set
comb.pred.test <- predict(comb.fit, combinedTestData)
# use the glm and rf models to predict results on the validation data set
glm.pred.val <- predict(glm.fit,validation)
rf.pred.val <- predict(rf.fit,validation)
# combine the results into data frame for the comb.fit
combinedValData <- data.frame(glm.pred=glm.pred.val,rf.pred=rf.pred.val)
# run the comb.fit on the combined validation data
comb.pred.val <- predict(comb.fit,combinedValData)
# tabulate the results - test data set RMSE Errors
rbind(test = c(glm = sqrt(sum((glm.pred.test-testing$wage)^2)),
rf = sqrt(sum((rf.pred.test-testing$wage)^2)),
combined = sqrt(sum((comb.pred.test-testing$wage)^2))),
# validation data set RMSE Errors
validation = c(sqrt(sum((glm.pred.val-validation$wage)^2)),
sqrt(sum((rf.pred.val-validation$wage)^2)),
sqrt(sum((comb.pred.val-validation$wage)^2))))
## glm rf combined
## test 858.7074 888.0702 849.3771
## validation 1061.0891 1086.2027 1057.8264
- Note: a more detailed tutorial can be found in Rob Hyndman's Forecasting: principles and practice
- process: choose a smoothing method (simple moving average, ma, vs exponential smoothing/EMA, ets; see below), apply the corresponding functions to the training data, construct predictions with the forecast function, and evaluate them with the accuracy function
- simple moving averages = the prediction for a time point is made by averaging together values from a number of surrounding periods \[\hat{Y}_{t}=\frac{1}{2k+1}\sum_{j=-k}^{k} y_{t+j}\]
- the quantmod package can be used to pull trading/price information for publicly traded stocks
  - getSymbols("TICKER", src="google", from=date, to=date) = gets the daily high/low/open/close price and volume information for the specified stock ticker
    - "TICKER" = ticker of the stock you are attempting to pull information for
    - src="google" = get price/volume information from Google Finance
    - from and to = from and to dates for the price/volume information (should be date objects)
    - more information about getSymbols can be found in the documentation, ?getSymbols
  - to.monthly(GOOG) = converts stock data to a monthly time series from daily data
    - GOOG = data frame returned from the getSymbols function
    - ?to.period contains documentation for converting time series to OHLC (open high low close) series
  - googOpen <- Op(GOOG) = returns the opening price from the stock data frame
    - Cl(), Hi(), Lo() = return the close, high, and low price from the stock data frame
- ts(googOpen, frequency=12) = converts data to a time series with frequency observations per time unit
  - frequency=12 = number of observations per unit time (12 in this case because there are 12 months in each year \(\rightarrow\) converts data into a yearly time series)
- decompose(ts) = decomposes the time series into trend, seasonal, and irregular components by using moving averages
  - ts = time series object
- window(ts, start=1, end=6) = subsets the time series at the specified starting and ending points
  - the start and end arguments must correspond to the time unit rather than the index
    - since ts is a yearly series (frequency = 12), start/end should correspond to the row numbers or years (each year has 12 observations corresponding to the months)
    - c(1, 7) can be used to specify the element of a particular year (in this case, July of the first year/row)
    - decimal values can be used for start/end, and the closest element (June of the 9th year in this case) will be used
      - end=9-0.01 can be used as a shortcut to specify "up to 9", since end=9 would include the first element of the 9th row
- the forecast package can be used for forecasting time series data
  - ma(ts, order=3) = calculates the simple moving average for the order specified
    - order=3 = order of the moving average smoother, effectively the number of values that should be used to calculate the moving average
  - ets(train, model="MMM") = runs an exponential smoothing model on the training data
    - model="MMM" = method used to create the exponential smoothing
    - more information can be found at ?ets, and the corresponding model chart is here
  - forecast(ts) = performs a forecast on the specified time series and returns 5 columns: the forecast values and the high/low bounds of the 80 and 95 percent prediction intervals
    - plot(forecast) = plots the forecast object, which includes the training data, forecast values for the test periods, as well as the 80 and 95 percent confidence interval regions
    - accuracy(forecast, testData) = returns the accuracy metrics (RMSE, etc.) for the forecast model
- the quandl package is also used for finance-related predictions

# load quantmod package
library(quantmod);
# specify to and from dates
from.dat <- as.Date("01/01/00", format="%m/%d/%y")
to.dat <- as.Date("3/2/15", format="%m/%d/%y")
# get data for AAPL from Google Finance for the specified dates
getSymbols("AAPL", src="google", from = from.dat, to = to.dat)
## [1] "AAPL"
# convert the retrieved daily data to monthly data
mAAPL <- to.monthly(AAPL)
# extract the closing price and convert it to yearly time series (12 observations per year)
ts <- ts(Cl(mAAPL), frequency = 12)
# plot the decomposed parts of the time series
plot(decompose(ts),xlab="Years")
# load forecast library
library(forecast)
# find the number of rows (years)
rows <- ceiling(length(ts)/12)
# use 90% of the data to create training set
ts.train <- window(ts, start = 1, end = floor(rows*.9)-0.01)
# use the rest of data to create test set
ts.test <- window(ts, start = floor(rows*.9))
# plot the training set
plot(ts.train)
# add the moving average in red
lines(ma(ts.train,order=3),col="red")
# compute the exponential smoothing average
ets <- ets(ts.train,model="MMM")
# construct a forecasting model using the exponential smoothing function
fcast <- forecast(ets)
# plot forecast and add actual data in red
plot(fcast); lines(ts.test,col="red")
# print the accuracy of the forecast model
accuracy(fcast,ts.test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1188298 2.825883 1.646959 -0.61217 10.68901 0.1924329
## Test set -7.8132889 16.736910 15.079222 -13.64900 20.31005 1.7618772
## ACF1 Theil's U
## Training set 0.09773823 NA
## Test set 0.84664431 3.360515
- kmeans(data, centers=3) = can be used to perform clustering on the provided data
  - centers=3 = controls the number of clusters the algorithm should aim to divide the data into
- the cl_predict function in the clue package provides similar functionality for assigning new observations to clusters (see the sketch at the end of this section)

# load iris data
data(iris)
# create training and test sets
inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
training <- iris[inTrain,]; testing <- iris[-inTrain,]
# perform k-means clustering for the data without the Species information
# Species = what the true clusters are
kMeans1 <- kmeans(subset(training,select=-c(Species)),centers=3)
# add clusters as new variable to training set
training$clusters <- as.factor(kMeans1$cluster)
# plot clusters vs Species classification
p1 <- qplot(Petal.Width,Petal.Length,colour=clusters,data=training) +
ggtitle("Clusters Classification")
p2 <- qplot(Petal.Width,Petal.Length,colour=Species,data=training) +
ggtitle("Species Classification (Truth)")
grid.arrange(p1, p2, ncol = 2)
# tabulate the results from clustering and actual species
table(kMeans1$cluster,training$Species)
##
## setosa versicolor virginica
## 1 35 0 0
## 2 0 0 27
## 3 0 35 8
# build classification trees using the k-means cluster
clustering <- train(clusters ~.,data=subset(training,select=-c(Species)),method="rpart")
# tabulate the prediction results on training set vs truth
table(predict(clustering,training),training$Species)
##
## setosa versicolor virginica
## 1 35 0 0
## 2 0 0 29
## 3 0 35 6
# tabulate the prediction results on test set vs truth
table(predict(clustering,testing),testing$Species)
##
## setosa versicolor virginica
## 1 15 0 0
## 2 0 1 12
## 3 0 14 3
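A minimal sketch of the cl_predict route mentioned above (hypothetical usage, assuming the clue package; not from the original notes):

# assign each test observation to the nearest of the k-means centers
library(clue)
testClusters <- cl_predict(kMeans1, newdata=subset(testing, select=-c(Species)))
# compare the predicted clusters against the true species
table(testClusters, testing$Species)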