if (!require("pacman")) install.packages("pacman")
pacman::p_load(caret, e1071, kernlab, ggplot2, ISLR, Hmisc, gridExtra, RANN,
AppliedPredictiveModeling)
library(AppliedPredictiveModeling)
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
trainIndex <- createDataPartition(diagnosis, p=0.5, list=FALSE)
training <- adData[trainIndex,]
testing <- adData[-trainIndex,]
How to create a partition of training/test data, using 75% for training and 25% for testing, for the outcome spam$type:
library(caret); library(e1071); library(kernlab)
data(spam)
inTrain <- createDataPartition(y=spam$type, p=0.75, list=FALSE)
inTrain holds the indexes of all items selected for the training data set, so we use it to subset training and testing.
training <- spam[inTrain,]
testing <- spam[-inTrain,]
dim(training)
## [1] 3451 58
Now training contains all rows whose indexes are in inTrain, and testing contains everything else (-inTrain, i.e. "not in").
To split with k-folds, we pass the outcome and the number of folds we want to create. Here we ask for the folds as a list, and for each fold to return the training set (returnTrain=TRUE).
set.seed(32323)
folds <- createFolds(y=spam$type, k=10, list=TRUE, returnTrain=TRUE)
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10
## 4141 4140 4141 4142 4140 4142 4141 4141 4140 4141
The size of each fold is ~ 4141.
And here’s how we look at samples in fold one:
folds[[1]][1:10]
## [1] 1 2 3 4 5 6 7 8 9 10
We can also return the test samples in each fold (returnTrain=FALSE):
set.seed(32323)
folds <- createFolds(y=spam$type, k=10, list=TRUE, returnTrain=FALSE)
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10
## 460 461 460 459 461 459 460 460 461 460
The size of each test fold is ~460: with k=10, each fold holds out roughly 1/10 of the 4601 samples (not a 75/25 split).
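As a quick sanity check (a small sketch, no output shown), each test fold should hold roughly 1/k = 1/10 of the rows:
round(sapply(folds, length) / nrow(spam), 3)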
And here’s how we look at samples in fold one:
folds[[1]][1:10]
## [1] 24 27 32 40 41 43 55 58 63 68
We can also create bootstrap resamples (sampling with replacement) using createResample, passing the number of resamples we want:
set.seed(32323)
folds <- createResample(y=spam$type, times=10, list=TRUE)
sapply(folds, length)
## Resample01 Resample02 Resample03 Resample04 Resample05 Resample06
## 4601 4601 4601 4601 4601 4601
## Resample07 Resample08 Resample09 Resample10
## 4601 4601 4601 4601
Each resample contains 4601 samples, the same size as the full dataset, because bootstrapping draws with replacement.
And here’s how we look at the samples in the first resample:
folds[[1]][1:10]
## [1] 1 2 3 3 3 5 5 7 8 12
Since we are resampling with replacement, some of the items appear repeated within a fold.
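As a quick check (a small sketch, no output shown), we can count how many indices in the first resample are repeats:
sum(duplicated(folds[[1]]))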
Simple random sampling of time series is probably not the best way to resample times series data. Hyndman and Athanasopoulos (2013) discuss rolling forecasting origin techniques that move the training and test sets in time.
set.seed(32323)
tme <- 1:1000
folds <- createTimeSlices(y=tme, initialWindow=20, horizon=10)
names(folds)
## [1] "train" "test"
The indices are taken consecutively in time: first the training window, then the test window:
folds$train[[1]]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
We have 20 values (matching initialWindow)
folds$test[[1]]
## [1] 21 22 23 24 25 26 27 28 29 30
We have 10 values (matching horizon)
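In practice we rarely build these slices by hand; the same rolling-origin scheme can be passed to train through trainControl (a minimal sketch; the model call is only illustrative, and myTimeSeriesData is a hypothetical data frame ordered in time):
tsControl <- trainControl(method='timeslice', initialWindow=20, horizon=10, fixedWindow=TRUE)
# e.g. train(y ~ ., data=myTimeSeriesData, method='lm', trControl=tsControl)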
We use the train function to fit a model. Its default arguments are:
args(train.default)
## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL,
## metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric %in%
## c("RMSE", "logLoss"), FALSE, TRUE), trControl = trainControl(),
## tuneGrid = NULL, tuneLength = 3)
## NULL
We can use trainControl to set additional training parameters:
args(trainControl)
## function (method = "boot", number = ifelse(grepl("cv", method),
## 10, 25), repeats = ifelse(grepl("cv", method), 1, number),
## p = 0.75, search = "grid", initialWindow = NULL, horizon = 1,
## fixedWindow = TRUE, verboseIter = FALSE, returnData = TRUE,
## returnResamp = "final", savePredictions = FALSE, classProbs = FALSE,
## summaryFunction = defaultSummary, selectionFunction = "best",
## preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5),
## sampling = NULL, index = NULL, indexOut = NULL, indexFinal = NULL,
## timingSamps = 0, predictionBounds = rep(FALSE, 2), seeds = NA,
## adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE),
## trim = FALSE, allowParallel = TRUE)
## NULL
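For example, a minimal sketch of swapping the default bootstrap for 10-fold cross-validation on the spam training set created above (modelFitCV is just an illustrative name):
set.seed(32343)
ctrl <- trainControl(method='cv', number=10)
modelFitCV <- train(type ~ ., data=training, method='glm', trControl=ctrl)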
Now let’s predict type using all the other features in the training dataset we created before, using a generalized linear model, glm, as a method:
set.seed(32343)
modelFit <- train(type ~ ., data=training, method='glm')
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.9160009 0.8231214 0.005584604 0.01184724
##
##
As you can see, the 57 predictors, 1 outcome (classes nonspam/spam) and 3451 samples match the results of dim(training) (3451 rows and 58 columns). Some additional information is reported too, such as the resampling method, in this case bootstrapping (random sampling with replacement) with 25 repetitions.
Resampling brought accuracy to about 91.6%.
Most of these procedures rely on pseudo-random numbers, so calling set.seed is crucial for reproducibility.
To take a peek at the model, in this case the fitted coefficients relating the class (spam/nonspam) to each of the 57 predictors:
modelFit$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) make address
## -1.438e+00 -1.839e-01 -1.489e-01
## all num3d our
## 1.175e-01 2.480e+00 5.420e-01
## over remove internet
## 9.038e-01 2.457e+00 4.973e-01
## order mail receive
## 5.102e-01 1.053e-01 -5.919e-01
## will people report
## -1.932e-01 -2.159e-01 2.981e-01
## addresses free business
## 8.834e-01 8.883e-01 9.754e-01
## email you credit
## 1.735e-01 7.511e-02 9.960e-01
## your font num000
## 2.553e-01 1.483e-01 1.911e+00
## money hp hpl
## 6.091e-01 -1.837e+00 -8.798e-01
## george num650 lab
## -1.179e+01 4.687e-01 -2.362e+00
## labs telnet num857
## -3.192e-01 -1.544e-01 1.026e+00
## data num415 num85
## -8.858e-01 5.891e-01 -2.025e+00
## technology num1999 parts
## 9.146e-01 3.811e-02 4.856e-01
## pm direct cs
## -8.030e-01 -4.246e-01 -5.553e+02
## meeting original project
## -2.624e+00 -1.211e+00 -2.089e+00
## re edu table
## -7.711e-01 -1.383e+00 -2.202e+00
## conference charSemicolon charRoundbracket
## -3.981e+00 -1.174e+00 -1.180e-01
## charSquarebracket charExclamation charDollar
## -4.938e-01 2.642e-01 5.037e+00
## charHash capitalAve capitalLong
## 2.437e+00 3.563e-03 1.021e-02
## capitalTotal
## 8.545e-04
##
## Degrees of Freedom: 3450 Total (i.e. Null); 3393 Residual
## Null Deviance: 4628
## Residual Deviance: 1408 AIC: 1524
To predict which of the two classes (spam/nonspam) the items in the testing dataset belong to:
predictions <- predict(modelFit, newdata=testing)
head(predictions)
## [1] spam spam spam spam spam spam
## Levels: nonspam spam
To validate how close the predictions are to the true classes in the testing dataset, we generate a confusion matrix:
confusionMatrix(predictions, testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 664 42
## spam 33 411
##
## Accuracy : 0.9348
## 95% CI : (0.9189, 0.9484)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8629
## Mcnemar's Test P-Value : 0.3556
##
## Sensitivity : 0.9527
## Specificity : 0.9073
## Pos Pred Value : 0.9405
## Neg Pred Value : 0.9257
## Prevalence : 0.6061
## Detection Rate : 0.5774
## Detection Prevalence : 0.6139
## Balanced Accuracy : 0.9300
##
## 'Positive' Class : nonspam
##
The reference table gives us an idea of hits and misses, and the statistics below it report accuracy, sensitivity, specificity, confidence intervals and Kappa values.
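The confusionMatrix result is itself an object, so individual metrics can be pulled out programmatically (a small sketch; overall and byClass are standard components of caret's confusionMatrix object, and cm is just an illustrative name):
cm <- confusionMatrix(predictions, testing$type)
cm$overall['Accuracy']    # overall accuracy
cm$byClass['Sensitivity'] # a per-class statistic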
library(ISLR); library(ggplot2); library(caret)
data(Wage)
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
We can see the data is a restricted sample: all male, all in the Middle Atlantic region.
Let’s use a 70% split of training and 30% testing:
inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training)
## [1] 2102 12
dim(testing)
## [1] 898 12
Let’s set the testing set aside and not use it for anything at this point. Not even plotting!
The first way to get some insight into the data is a simple pairs plot, or scatter plot, between regressors and outcome:
featurePlot(x=training[,c('age','education','jobclass')], y=training$wage, plot='pairs')
From the lower left to the upper right, the diagonal shows age, education, jobclass and y (wage). You are looking for any panel that shows a trend; in this case, for example, there seems to be a positive relationship between education and wage.
We can look at a two-dimensional relationship with a simple qplot:
qplot(age, wage, data=training)
It shows some correlation, but raises the next question: why is there a cluster at the top of the plot?
Let’s bring some color to the plot by adding a second regressor:
qplot(age, wage, colour=jobclass, data=training)
Many of the points in the cluster seem to belong to the ‘Information’ job class, so that might explain part of the cluster.
We can also add regression smoothers, one regression line per level of the second regressor on the plot, e.g.:
qplot(age, wage, colour=education, data=training) +
geom_smooth(method='lm', formula=y~x)
qplot(wage, colour=education, data=training, geom='density')
The density plot shows that many people with less than a HS education are concentrated below a wage of 100, while the group near 300 is dominated by people with advanced degrees.
We can also cut the continuous variable into factor groups, which makes it easier to look for patterns:
library(Hmisc)
cutWage <- cut2(training$wage, g=3) # three quantile groups
table(cutWage)
## cutWage
## [ 20.1, 93.5) [ 93.5,118.9) [118.9,318.3]
## 715 700 687
qplot(cutWage, age, data=training, fill=cutWage, geom=c('boxplot'))
We can overlay the boxplots with jittered points, since boxplots alone can hide how the underlying samples are distributed:
library(gridExtra)
p1 <- qplot(cutWage, age, data=training, fill=cutWage, geom=c('boxplot'))
p2 <- qplot(cutWage, age, data=training, fill=cutWage, geom=c('boxplot', 'jitter'))
grid.arrange(p1, p2, ncol=2)
We can also tabulate the cut (factorized) variable against another factor to look for patterns:
t1 <- table(cutWage, training$jobclass)
t1
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 93.5) 447 268
## [ 93.5,118.9) 352 348
## [118.9,318.3] 284 403
It shows, for example, that there are fewer people in the Information class at lower wages.
We can use prop.table to get the proportion in each row or column; a margin of 1 gives proportions within each row.
prop.table(t1, 1)
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 93.5) 0.6251748 0.3748252
## [ 93.5,118.9) 0.5028571 0.4971429
## [118.9,318.3] 0.4133916 0.5866084
i.e. 62% of low-wage jobs are in the Industrial class and 38% in the Information class.
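Similarly, a margin of 2 gives proportions within each column, i.e. how each job class is split across the wage groups (a small sketch, no output shown):
prop.table(t1, 2)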
library(AppliedPredictiveModeling); library(caret)
data(concrete)
summary(concrete)
## Cement BlastFurnaceSlag FlyAsh Water
## Min. :102.0 Min. : 0.0 Min. : 0.00 Min. :121.8
## 1st Qu.:192.4 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.:164.9
## Median :272.9 Median : 22.0 Median : 0.00 Median :185.0
## Mean :281.2 Mean : 73.9 Mean : 54.19 Mean :181.6
## 3rd Qu.:350.0 3rd Qu.:142.9 3rd Qu.:118.30 3rd Qu.:192.0
## Max. :540.0 Max. :359.4 Max. :200.10 Max. :247.0
## Superplasticizer CoarseAggregate FineAggregate Age
## Min. : 0.000 Min. : 801.0 Min. :594.0 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.: 932.0 1st Qu.:731.0 1st Qu.: 7.00
## Median : 6.400 Median : 968.0 Median :779.5 Median : 28.00
## Mean : 6.205 Mean : 972.9 Mean :773.6 Mean : 45.66
## 3rd Qu.:10.200 3rd Qu.:1029.4 3rd Qu.:824.0 3rd Qu.: 56.00
## Max. :32.200 Max. :1145.0 Max. :992.6 Max. :365.00
## CompressiveStrength
## Min. : 2.33
## 1st Qu.:23.71
## Median :34.45
## Mean :35.82
## 3rd Qu.:46.13
## Max. :82.60
set.seed(1000)
inTrain <- createDataPartition(mixtures$CompressiveStrength, p=3/4)[[1]]
training <- mixtures[inTrain,]
testing <- mixtures[-inTrain,]
featurePlot(x=training[,c('Cement','BlastFurnaceSlag','FlyAsh','Water')], y=training$CompressiveStrength, plot='pairs')
Cement is well correlated to strength.
featurePlot(x=training[,c('Superplasticizer','CoarseAggregate','FineAggregate','Age')],
y=training$CompressiveStrength, plot='pairs')
Coarse and fine aggregates are well correlated with strength.
qplot(CompressiveStrength, FlyAsh, data=training) +
geom_smooth(method='lm', formula=y~x)
No correlation between FlyAsh and strength.
Plotting the outcome against the row index, and then coloring the points by each predictor cut into groups:
plot(training$CompressiveStrength, pch=19)
qplot(1:length(training$CompressiveStrength), training$CompressiveStrength,
col=cut2(training$Cement), data=training)
qplot(1:length(training$CompressiveStrength), training$CompressiveStrength,
col=cut2(training$BlastFurnaceSlag), data=training)
qplot(1:length(training$CompressiveStrength), training$CompressiveStrength,
col=cut2(training$FlyAsh, g=2), data=training)
qplot(1:length(training$CompressiveStrength), training$CompressiveStrength,
col=cut2(training$Water), data=training)
qplot(1:length(training$CompressiveStrength), training$CompressiveStrength,
col=cut2(training$Superplasticizer), data=training)
qplot(1:length(training$CompressiveStrength), training$CompressiveStrength,
col=cut2(training$CoarseAggregate), data=training)
qplot(1:length(training$CompressiveStrength), training$CompressiveStrength,
col=cut2(training$FineAggregate), data=training)
qplot(1:length(training$CompressiveStrength), training$CompressiveStrength,
col=cut2(training$Age), data=training)
FlyAsh follows the pattern of strength somewhat, but we cannot say it ‘perfectly explains’ the pattern in the outcome vs. index plot.
library(AppliedPredictiveModeling); library(caret)
data(concrete)
set.seed(1000)
inTrain <- createDataPartition(mixtures$CompressiveStrength, p=3/4)[[1]]
training <- mixtures[inTrain,]
testing <- mixtures[-inTrain,]
hist(training$Superplasticizer, xlab='super plasticizer')
The variable is skewed, with many values at zero. Would applying a log transform here make it unskewed?
hist(log10(training$Superplasticizer+1), xlab='super plasticizer log10 transformed')
No. There are too many zeroes, and the log transform does not remove the skew caused by them.
library(caret); library(e1071); library(kernlab)
data(spam)
inTrain <- createDataPartition(y=spam$type, p=0.75, list=FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
hist(training$capitalAve, xlab='average capital run length')
The vast majority of the run lengths are small, but a few are much larger.
c(mean(training$capitalAve), sd(training$capitalAve))
## [1] 5.180838 29.979038
The mean is low, and the values are skewed and highly variable. This will trip up many machine learning algorithms.
trainCapAve <- training$capitalAve
trainCapAveS <- (trainCapAve - mean(trainCapAve))/sd(trainCapAve)
c(mean(trainCapAveS), sd(trainCapAveS))
## [1] -4.257002e-18 1.000000e+00
Standardized variables have \(\mu = 0\) and \(\sigma = 1\) on the training set.
IMPORTANT: when you standardize a predictor in the training set, you must apply the same transformation to that predictor in the testing set, using the training set’s mean and standard deviation:
testCapAve <- testing$capitalAve
testCapAveS <- (testCapAve - mean(trainCapAve))/sd(trainCapAve)
c(mean(testCapAveS), sd(testCapAveS))
## [1] 0.001424955 1.217384985
Standardization on the test set is calculated with mean and standard deviation of the training set. As a consequence standardization of variables on the test set have \(\mu \neq 0\) and \(\sigma \neq 1\) (but hopefully they will be close enough).
We can use the preProcess function to take care of all details:
preObj <- preProcess(training[,-58], method=c('center','scale')) # all variables in training except the outcome
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
c(mean(trainCapAveS), sd(trainCapAveS))
## [1] -4.257002e-18 1.000000e+00
We can apply the same pre-processed object preObj to the testing dataset, i.e.:
testCapAveS <- predict(preObj, testing[,-58])$capitalAve
c(mean(testCapAveS), sd(testCapAveS))
## [1] 0.001424955 1.217384985
Again, the standardized variables on the test set have \(\mu \neq 0\) and \(\sigma \neq 1\) even when using preProcess, because preObj applies the training set's means and standard deviations.
We can pre-process predictors during the train phase by using the preProcess argument:
set.seed(32343)
modelFit <- train(type ~ ., data=training, method='glm', preProcess=c('center','scale'))
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.922388 0.8363122 0.007182872 0.01453939
##
##
The Box-Cox transformation is a useful data transformation technique to stabilize variance and make the data look more like a normal distribution:
preObj <- preProcess(training[,-58], method=c('BoxCox')) # all variables in training except the outcome
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
par(mfrow=c(1,2)); hist(trainCapAveS); qqnorm(trainCapAveS)
You can see that there is still a large number of samples at the lower bound of the histogram, and the Q-Q plot is not entirely linear.
To impute missing values we can use k-nearest-neighbours imputation: it takes the average of the k closest complete observations and replaces each missing value with that average.
library(RANN)
set.seed(13343)
# make some NA values, so we have something to impute
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1], size=1, prob=0.05) == 1
training$capAve[selectNA] <- NA
# impute and standardize
preObj <- preProcess(training[,-58], method='knnImpute') # all variables in training except the outcome
capAve <- predict(preObj, training[,-58])$capAve
# standardize true values
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth))/sd(capAveTruth)
quantile(capAve - capAveTruth)
## 0% 25% 50% 75% 100%
## -0.4648044369 0.0001057534 0.0007786010 0.0010934429 1.8216236106
quantile((capAve - capAveTruth)[selectNA])
## 0% 25% 50% 75% 100%
## -0.3112099676 -0.0167811049 0.0009838407 0.0193472829 1.8216236106
quantile((capAve - capAveTruth)[!selectNA])
## 0% 25% 50% 75% 100%
## -0.4648044369 0.0001554764 0.0007786010 0.0010756696 0.0013223803
\(Covariate = Predictors = Features\)
New covariates can also be created from existing ones, e.g. a squared term such as aSquare <- a^2.
library(ISLR); library(ggplot2); library(caret)
data(Wage)
inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
Convert factor variables to indicator variables
table(training$jobclass)
##
## 1. Industrial 2. Information
## 1051 1051
These are factor levels; many prediction algorithms deal poorly with qualitative variables, so let’s transform them into quantitative indicator (dummy) variables.
dummies <- dummyVars(wage ~ jobclass, data=training)
head(predict(dummies,newdata=training))
## jobclass.1. Industrial jobclass.2. Information
## 86582 0 1
## 161300 1 0
## 155159 0 1
## 11443 0 1
## 376662 0 1
## 450601 1 0
You can see 0s and 1s indicating whether a row is Industrial or Information, similar to a bit mask.
To find regressors with very low variability, which are unlikely to be good predictors, we can use nearZeroVar:
nsv <- nearZeroVar(training, saveMetrics=TRUE)
nsv
## freqRatio percentUnique zeroVar nzv
## year 1.037356 0.33301618 FALSE FALSE
## age 1.027027 2.85442436 FALSE FALSE
## sex 0.000000 0.04757374 TRUE TRUE
## maritl 3.272931 0.23786870 FALSE FALSE
## race 8.938776 0.19029496 FALSE FALSE
## education 1.389002 0.23786870 FALSE FALSE
## region 0.000000 0.04757374 TRUE TRUE
## jobclass 1.000000 0.09514748 FALSE FALSE
## health 2.468647 0.09514748 FALSE FALSE
## health_ins 2.352472 0.09514748 FALSE FALSE
## logwage 1.061728 19.17221694 FALSE FALSE
## wage 1.061728 19.17221694 FALSE FALSE
For each variable the algorithm calculates the percentage of unique values (percentUnique) and the frequency ratio (freqRatio). We can see that a few covariates are flagged as near zero variance due to low variability: sex and region.
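A minimal sketch of how those near-zero-variance columns could be dropped before modelling, using the nzv column of the table above (training2 is just an illustrative name):
training2 <- training[, !nsv$nzv] # keep only covariates with some variability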
Another way to create covariates is to allow for curved fits, generating spline basis functions of a variable, e.g.:
library(splines)
bsBasis <- bs(training$age, df=3) # third degree polynomial variables
head(bsBasis)
## 1 2 3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.4163380 0.32117502 0.082587862
## [3,] 0.4308138 0.29109043 0.065560908
## [4,] 0.3625256 0.38669397 0.137491189
## [5,] 0.3063341 0.42415495 0.195763821
## [6,] 0.4241549 0.30633413 0.073747105
Each of the three columns is a basis function of age (roughly a scaled linear, quadratic and cubic term). If you add these 3 covariates to the model, you allow for a curvy model fit:
lm1 <- lm(wage ~ bsBasis, data=training)
plot(training$age, training$wage, pch=19, cex=0.5)
points(training$age, predict(lm1, newdata=training), col='red', pch=19, cex=0.5)
To predict on the test data set:
p <- predict(bsBasis, age=testing$age)
head(p)
## 1 2 3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.4163380 0.32117502 0.082587862
## [3,] 0.4308138 0.29109043 0.065560908
## [4,] 0.3625256 0.38669397 0.137491189
## [5,] 0.3063341 0.42415495 0.195763821
## [6,] 0.4241549 0.30633413 0.073747105
library(caret); library(e1071); library(kernlab)
data(spam)
inTrain <- createDataPartition(y=spam$type, p=0.75, list=FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
Extracting all features with a correlation greater than 0.8:
M <- abs(cor(training[,-58])) # matrix of correlations between all features (outcome excluded)
diag(M) <- 0 # diagonal would be 1, making it 0
which(M > 0.8, arr.ind=T)
## row col
## num415 34 32
## direct 40 32
## num857 32 34
## direct 40 34
## num857 32 40
## num415 34 40
Among the highly correlated features are num415 and num857:
names(spam)[c(34,32)]
## [1] "num415" "num857"
plot(spam[,34],spam[,32])
They lie on a line, so they are highly correlated.
One way we could reduce the number of predictors would be to rotate the plot:
X <- 0.71*training$num415 + 0.71*training$num857
Y <- 0.71*training$num415 - 0.71*training$num857
plot(X, Y)
You can get essentially the same result with PCA:
smallSpam <- spam[,c(34,32)]
prComp <- prcomp(smallSpam)
plot(prComp$x[,1], prComp$x[,2])
The rotation matrix shows the exact coefficients used to obtain the principal components:
prComp$rotation
## PC1 PC2
## num415 0.7080625 0.7061498
## num857 0.7061498 -0.7080625
Doing PCA on the whole spam data, let’s plot only the first and second most important principal components:
typeColor <- ((spam$type=='spam')*1 + 1) # red for spam, black otherwise
prComp <- prcomp(log10(spam[,-58]+1)) # using log to make it more gaussian looking, all features
plot(prComp$x[,1], prComp$x[,2], col=typeColor, xlab='PC1', ylab='PC2')
We can see PC1 separates spam from nonspam reasonably well (above a certain value along PC1 the points shift from black to red).
We can pre-process using PCA as well:
preProc <- preProcess(log10(spam[,-58]+1), method='pca', pcaComp=2) # use 2 components only
spamPC <- predict(preProc, log10(spam[,-58]+1))
plot(spamPC[,1], spamPC[,2], col=typeColor)
And we can use the result to train a model, say glm.
preProc <- preProcess(log10(training[,-58]+1), method='pca', pcaComp=2) # use 2 components only
trainPC <- predict(preProc, log10(training[,-58]+1))
modelFit <- train(training$type ~ ., method='glm', data=trainPC)
And check the accuracy using a confusion matrix:
testPC <- predict(preProc, log10(testing[,-58]+1))
confusionMatrix(testing$type, predict(modelFit, testPC))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 645 52
## spam 73 380
##
## Accuracy : 0.8913
## 95% CI : (0.8719, 0.9087)
## No Information Rate : 0.6243
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7705
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.8983
## Specificity : 0.8796
## Pos Pred Value : 0.9254
## Neg Pred Value : 0.8389
## Prevalence : 0.6243
## Detection Rate : 0.5609
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.8890
##
## 'Positive' Class : nonspam
##
Or you can use train with preProcess='pca' to compute the components and train the model in one step:
modelFit <- train(training$type ~ ., method='glm', data=training, preProcess='pca')
confusionMatrix(testing$type, predict(modelFit, testing))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 658 39
## spam 50 403
##
## Accuracy : 0.9226
## 95% CI : (0.9056, 0.9374)
## No Information Rate : 0.6157
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8372
## Mcnemar's Test P-Value : 0.2891
##
## Sensitivity : 0.9294
## Specificity : 0.9118
## Pos Pred Value : 0.9440
## Neg Pred Value : 0.8896
## Prevalence : 0.6157
## Detection Rate : 0.5722
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.9206
##
## 'Positive' Class : nonspam
##
The problem can also be stated the other way around: how many components are needed to capture X% of the variance (the thresh parameter)? For example, on the Alzheimer's dataset, using only the predictors whose names begin with IL, how many principal components are needed to capture 90% of the variance?
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]
subset <- training[,grep("^IL", names(training))]
preProcess(subset, method='pca', thresh=0.9)$numComp
## [1] 9
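The same answer can be cross-checked with prcomp directly, since preProcess with method='pca' centers and scales the data first (a small sketch; the result should agree with numComp above):
pr <- prcomp(subset, center=TRUE, scale.=TRUE)
cumVar <- cumsum(pr$sdev^2) / sum(pr$sdev^2)
which(cumVar >= 0.9)[1] # number of components needed to reach 90% of the variance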
More specifically:
preProcess(subset, method='pca', thresh=0.9)$rotation
## PC1 PC2 PC3 PC4
## IL_11 -0.06529786 0.5555956867 0.2031317937 -0.050389599
## IL_13 0.27529157 0.3559427297 -0.0399010765 0.265076920
## IL_16 0.42079000 0.0007224953 0.0832211446 -0.082097273
## IL_17E -0.01126118 0.5635958176 0.3744707126 0.302512329
## IL_1alpha 0.25078195 -0.0687043488 -0.3008366900 0.330945942
## IL_3 0.42026485 -0.0703352892 -0.1049647272 -0.065352774
## IL_4 0.33302031 0.0688495706 -0.1395450144 0.165631691
## IL_5 0.38706503 -0.0039619980 0.0005616126 -0.224448981
## IL_6 0.05398185 -0.4248425653 0.6090821756 0.417591202
## IL_6_Receptor 0.21218980 0.1005338329 0.2920341087 -0.659953479
## IL_7 0.32948731 0.0806070090 -0.1966471906 0.165544952
## IL_8 0.29329723 -0.1883039842 0.4405255221 0.002811187
## PC5 PC6 PC7 PC8
## IL_11 0.73512798 -0.102014559 0.20984151 -0.08402367
## IL_13 -0.25796332 -0.068927711 0.58942516 -0.06839401
## IL_16 0.04435883 -0.007094672 -0.06581741 0.02665034
## IL_17E -0.38918707 0.221149380 -0.46462692 0.02185290
## IL_1alpha 0.16992452 0.742391473 0.12787035 -0.19555207
## IL_3 0.02352819 -0.165587911 -0.09006656 -0.15062164
## IL_4 -0.14268797 -0.297421293 0.19661173 0.57346657
## IL_5 0.08426042 0.153835977 -0.16425757 -0.02917286
## IL_6 -0.00165066 -0.166089521 0.21895103 -0.34568186
## IL_6_Receptor -0.29654048 0.138000448 0.22657846 -0.26274531
## IL_7 0.11373532 -0.405698338 -0.42065832 -0.40841984
## IL_8 0.28608600 0.184321013 -0.14833779 0.49101347
## PC9
## IL_11 0.183359387
## IL_13 -0.512677898
## IL_16 -0.225338083
## IL_17E 0.117769681
## IL_1alpha 0.256874424
## IL_3 0.014565029
## IL_4 0.591849422
## IL_5 0.003418637
## IL_6 0.221816813
## IL_6_Receptor 0.276527746
## IL_7 0.002607462
## IL_8 -0.311002624
Another use of PCA is to compare models and their accuracy. For example, using only the predictors whose names begin with IL and diagnosis as the outcome:
set.seed(3433)
ilColumns <- grep("^IL", names(training), value=TRUE)
df <- data.frame(diagnosis, predictors[, ilColumns])
inTrain = createDataPartition(df$diagnosis, p = 3/4)[[1]]
training = df[ inTrain,]
testing = df[-inTrain,]
Build two predictive models. One using the predictors as they are (no PCA):
modelFit <- train(diagnosis ~ ., method = "glm", data = training)
predictions <- predict(modelFit, newdata = testing)
cm1 <- confusionMatrix(predictions, testing$diagnosis)
print(cm1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 2 9
## Control 20 51
##
## Accuracy : 0.6463
## 95% CI : (0.533, 0.7488)
## No Information Rate : 0.7317
## P-Value [Acc > NIR] : 0.96637
##
## Kappa : -0.0702
## Mcnemar's Test P-Value : 0.06332
##
## Sensitivity : 0.09091
## Specificity : 0.85000
## Pos Pred Value : 0.18182
## Neg Pred Value : 0.71831
## Prevalence : 0.26829
## Detection Rate : 0.02439
## Detection Prevalence : 0.13415
## Balanced Accuracy : 0.47045
##
## 'Positive' Class : Impaired
##
And one using PCA, keeping principal components that explain 80% of the variance in the predictors, again with method="glm" in the train function:
modelFit <- train(training$diagnosis ~ ., method = "glm", preProcess = "pca", data = training,
trControl = trainControl(preProcOptions = list(thresh = 0.8)))
cm2 <- confusionMatrix(testing$diagnosis, predict(modelFit, testing))
print(cm2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 3 19
## Control 4 56
##
## Accuracy : 0.7195
## 95% CI : (0.6094, 0.8132)
## No Information Rate : 0.9146
## P-Value [Acc > NIR] : 1.000000
##
## Kappa : 0.0889
## Mcnemar's Test P-Value : 0.003509
##
## Sensitivity : 0.42857
## Specificity : 0.74667
## Pos Pred Value : 0.13636
## Neg Pred Value : 0.93333
## Prevalence : 0.08537
## Detection Rate : 0.03659
## Detection Prevalence : 0.26829
## Balanced Accuracy : 0.58762
##
## 'Positive' Class : Impaired
##
What are the accuracies for each model?
c(cm1$overall[1], cm2$overall[1])
## Accuracy Accuracy
## 0.6463415 0.7195122
Predicting the eruption duration of the Old Faithful geyser from the waiting time:
library(caret); data("faithful"); set.seed(333)
inTrain <- createDataPartition(y=faithful$waiting, p=0.5, list=FALSE)
trainFaith <- faithful[inTrain,]; testFaith <- faithful[-inTrain,]
head(trainFaith)
## eruptions waiting
## 1 3.600 79
## 3 3.333 74
## 5 4.533 85
## 6 2.883 55
## 7 4.700 88
## 8 3.600 85
We will fit a linear model on the following equation:
\(ED_i = b_0 + b_1 WT_i + e_i\)
Where \(ED_i\) is the eruption duration (the outcome), \(WT_i\) is the waiting time (the regressor) and \(e_i\) is the error term for observation \(i\).
lm1 <- lm(eruptions ~ waiting, data=trainFaith)
summary(lm1)
##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26990 -0.34789 0.03979 0.36589 1.05020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.792739 0.227869 -7.867 1.04e-12 ***
## waiting 0.073901 0.003148 23.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.8018
## F-statistic: 551 on 1 and 135 DF, p-value: < 2.2e-16
From these results, \(b_0 = -1.792739\) and \(b_1 = 0.073901\). Visually, the fitted linear model looks something like this:
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col='blue', xlab='waiting', ylab='duration')
lines(trainFaith$waiting, lm1$fitted, lwd=3)
You can get to the same results using train with method='lm':
modFit <- train(eruptions ~ waiting, data=trainFaith, method='lm')
summary(modFit$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26990 -0.34789 0.03979 0.36589 1.05020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.792739 0.227869 -7.867 1.04e-12 ***
## waiting 0.073901 0.003148 23.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.8018
## F-statistic: 551 on 1 and 135 DF, p-value: < 2.2e-16
We get essentially the same results: \(b_0 = -1.792739\) (Intercept) and \(b_1 = 0.073901\) (waiting).
To predict a new value we use the estimated coefficients, denoted by a ‘hat’ over the corresponding symbols in the previous equation:
\(\hat {ED_i} = \hat {b_0} + \hat {b_1} WT_i\)
For example, what should the eruption time be for a waiting time of 80?
coef(lm1)[1] + coef(lm1)[2]*80
## (Intercept)
## 4.119307
Same thing, using predict:
predict(lm1, data.frame(waiting=80))
## 1
## 4.119307
par(mfrow=c(1,2))
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col='blue', xlab='waiting', ylab='duration',
main='training set')
lines(trainFaith$waiting, predict(lm1),lwd=3)
plot(testFaith$waiting, testFaith$eruptions, pch=19, col='blue', xlab='waiting', ylab='duration',
main='testing set')
lines(testFaith$waiting, predict(lm1, newdata=testFaith),lwd=3)
The line doesn’t perfectly fit the training set, but it is a pretty good approximation for predictions on the testing set.
Error on the training set (computed here as the square root of the sum of squared errors):
sqrt(sum((lm1$fitted - trainFaith$eruptions)^2))
## [1] 5.75186
And on the test set:
sqrt(sum((predict(lm1, newdata=testFaith) - testFaith$eruptions)^2))
## [1] 5.838559
About the same, though as expected the error on the test set is a bit higher.
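Strictly speaking, an RMSE would divide by the number of observations before taking the square root; a small sketch of that version (no output shown):
sqrt(mean((lm1$fitted - trainFaith$eruptions)^2))                     # training
sqrt(mean((predict(lm1, newdata=testFaith) - testFaith$eruptions)^2)) # test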
Parameter interval='prediction' tells predict to generate a prediction interval:
pred1 <- predict(lm1, newdata=testFaith, interval='prediction')
ord <- order(testFaith$waiting)
plot(testFaith$waiting, testFaith$eruptions, pch=19, col='blue') # test data predictor vs outcome
matlines(testFaith$waiting[ord], pred1[ord,], type='l',col=c(1,2,2),lty=c(1,1,1),lwd=2)
Partitioning the Wage dataset again, 70/30 between training and testing data sets:
library(ISLR); library(ggplot2); library(caret)
data(Wage)
inTrain <- createDataPartition(y=Wage$wage, p=0.7, list=FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training)
## [1] 2102 12
dim(testing)
## [1] 898 12
A linear model for multiple regressors would look something like this:
\(wage_i = b_0 + b_1 age_i + b_2 I(jobclass_i = 'Information') + \sum_{k=1}^{4} \gamma_k I(education_i = level_k)\)
where \(b_0\) is the intercept, \(b_1\) is the coefficient for age, \(b_2\) is the coefficient for the indicator that equals 1 when jobclass is 'Information' and 0 otherwise, and the \(\gamma_k\) are the coefficients for the education indicators, each equal to 1 when \(education_i = level_k\) and 0 otherwise.
In the formula wage ~ age + jobclass + education, since jobclass and education are factors, R automatically (nice!) creates the indicator variables \(I(\cdot)\) for the factor levels described above.
modFit <- train(wage ~ age + jobclass + education, method='lm', data=training)
print(modFit)
## Linear Regression
##
## 2102 samples
## 11 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 36.43771 0.267012 1.281502 0.01740793
##
##
The ‘11 predictor’ line counts the columns of the training data frame. The fitted model itself uses age plus the indicator variables created from the factors: one for jobclass (2 levels) and four for education (5 levels), each factor dropping a reference level, plus the intercept.
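To see exactly which indicator columns the formula expands into, we can inspect the design matrix with base R's model.matrix (a small sketch, no output shown):
head(model.matrix(wage ~ age + jobclass + education, data=training))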
We use diagnostic plots to get some insight into whether or not our model is missing important predictors.
finMod <- modFit$finalModel
plot(finMod, 1, pch=19, cex=0.5, col='#00000010')
This is the residuals vs fitted plot. We want to see a flat line at residuals = 0 (which is not the case for higher fitted values) and only a small number of outliers (also not the case here; look at the labelled points at the top).
We can also color by variables not used in the original model:
qplot(finMod$fitted, finMod$residuals, col=race, data=training)
We can see that several outliers are related to race (white race), indicating that race is a potential missing regressor.
The index is just the position of a sample in the dataset:
plot(finMod$residuals, pch=19)
We can see a clear trend: residuals grow with the index, and there are more outliers as the index grows (right side of the plot). This indicates that a potential regressor is missing from our model, usually some time-related or otherwise ordered continuous variable (age, date, timestamp, etc.).
This is a post-mortem analysis; do not go back to your model and change it based on what you find here (there is a big chance of overfitting if you do so).
pred <- predict(modFit, testing)
qplot(wage, pred, col=year, data=testing)
You are looking for a linear, 45-degree relationship between predicted and true values, with no outliers. Adding color helps spot a variable that might be preventing that linear relationship. We can also fit a model using all available covariates:
modFitAll <- train(wage ~ ., method='lm', data=training)
print(modFitAll)
## Linear Regression
##
## 2102 samples
## 11 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 12.48925 0.9108799 0.7653664 0.005605719
##
##
We can see we are doing better; for example, the RMSE dropped from 36.43771 to 12.48925.
pred <- predict(modFitAll, testing)
qplot(wage, pred, data=testing)
Partitioning the data:
library(ElemStatLearn)
##
## Attaching package: 'ElemStatLearn'
## The following object is masked _by_ '.GlobalEnv':
##
## spam
data(SAheart)
set.seed(8484)
train = sample(1:dim(SAheart)[1],size=dim(SAheart)[1]/2,replace=F)
trainSA = SAheart[train,]
testSA = SAheart[-train,]
head(trainSA)
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 238 176 5.76 4.89 26.10 Present 46 27.30 19.44 57 0
## 114 174 0.00 8.46 35.10 Present 35 25.27 0.00 61 1
## 312 174 3.50 5.26 21.97 Present 36 22.04 8.33 59 1
## 301 166 4.10 4.00 34.30 Present 32 29.51 8.23 53 0
## 311 130 0.05 2.44 28.25 Present 67 30.86 40.32 34 0
## 179 128 0.04 8.22 28.17 Absent 65 26.24 11.73 24 0
Fitting a model:
set.seed(13234)
modFit <- train(chd ~ age + alcohol + obesity + tobacco + typea + ldl,
method='glm', family='binomial', data=trainSA)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to
## do classification? If so, use a 2 level factor as your outcome column.
Calculating misclassification:
missClass <- function(values,prediction){
sum(((prediction > 0.5)*1) != values)/length(values)
}
Test set misclassification:
missClass(testSA$chd, predict(modFit, newdata=testSA))
## [1] 0.3116883
Training set misclassification:
missClass(trainSA$chd, predict(modFit, newdata=trainSA))
## [1] 0.2727273
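The warning printed when fitting the model above appears because chd is coded as numeric 0/1. A minimal sketch of refitting with a factor outcome so caret treats it as classification (modFit2 is just an illustrative name; predict then returns class labels, so we compare them with confusionMatrix instead of the 0.5 threshold rule):
trainSA$chd <- factor(trainSA$chd, labels=c('no','yes'))
testSA$chd  <- factor(testSA$chd,  labels=c('no','yes'))
modFit2 <- train(chd ~ age + alcohol + obesity + tobacco + typea + ldl,
                 method='glm', family='binomial', data=trainSA)
confusionMatrix(predict(modFit2, newdata=testSA), testSA$chd)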