Previous: 3. Process data
Next: 5. Make predictions
Load packages and the Canadian programming library. Please note that caret
automatically loads plyr
. As I’m also using dplyr
, it’s crucial that dplyr
is loaded after plyr
. Doing it the other way around causes problems. Also define two CPU cores to be used for model training.
suppressMessages({
library(plyr)
library(dplyr)
library(caret)
library(doMC)
})
source("canadian.R")
registerDoMC(cores=2)
Function to detect home/away team and winner.
team <- function(x, tied.eh=c("silent", "warn", "stop")) {
counts <- sort(table(x), decreasing=TRUE)
if ((length(counts) == 1) || (counts[1] > counts[2]))
return(names(counts)[1])
tied.eh <- match.arg(tied.eh)
if (tied.eh == "error")
stop("I'm sorry, but I couldn't figure out the winner: ",
paste(x, collapse=", "))
if (tied.eh == "warn")
warning("I couldn't figure out the winner: ", paste(x, collapse=", "))
x[1]
}
Function to add game statistis.
add_stats <- function(games, gamestats, which=c("both", "single", "overall")) {
which <- match.arg(which)
if (which == "overall") {
away <- left_join(games, gamestats[["overall"]],
by=c("season", awayteam="team"))
home <- left_join(games, gamestats[["overall"]],
by=c("season", hometeam="team"))
} else {
away <- left_join(games, gamestats[["away"]],
by=c("season", awayteam="team"))
home <- left_join(games, gamestats[["home"]],
by=c("season", hometeam="team"))
}
if (which == "both") {
away2 <- left_join(games, gamestats[["home"]],
by=c("season", awayteam="team"))
home2 <- left_join(games, gamestats[["away"]],
by=c("season", hometeam="team"))
}
games$goals <- away$goals - home$goals
games$shots <- away$shots - home$shots
games$faceoffs <- away$faceoffs - home$faceoffs
games$penalties <- away$penalties - home$penalties
games$pp <- away$pp - home$pk
games$pk <- away$pk - home$pp
if (which == "both") {
games$goals2 <- away2$goals - home2$goals
games$shots2 <- away2$shots - home2$shots
games$faceoffs2 <- away2$faceoffs - home2$faceoffs
games$penalties2 <- away2$penalties - home2$penalties
games$pp2 <- away2$pp - home2$pk
games$pk2 <- away2$pk - home2$pp
}
games
}
Load and process.
load(file.path("source-data", "nhlscrapr-core.RData"))
rm(list=c("roster.master", "roster.unique"))
gamestats <- readRDS("processed.rds")
games <- tbl_df(games)
games <- games %>%
filter(status != 0, session == "Playoffs", season != "20142015") %>%
mutate(awayscore=as.integer(awayscore), homescore=as.integer(homescore))
playoffs <- games %>%
mutate(winner=ifelse(awayscore > homescore, awayteam, hometeam)) %>%
group_by(season, round=substring(gcode, 3, 3),
series=substring(gcode, 4, 4)) %>%
summarise(awayteam=team(awayteam), hometeam=team(hometeam),
winner=team(winner, tied.eh="stop")) %>%
select(season, round, series, awayteam, hometeam, winner) %>%
mutate(winner=ifelse(awayteam == winner, "away", "home"))
playoffs <- add_stats(playoffs, gamestats)
playoffs <- playoffs %>%
ungroup() %>%
mutate(season=as.integer(substring(season, 5, 8))) %>%
select(-season, -round, -series, -awayteam, -hometeam)
playoffs$winner = as.factor(playoffs$winner)
# settings
method <- "repeatedcv"
number <- 10
repeats <- 10
preProcess <- c("center", "scale")
tuneLength <- 5
metric <- "Accuracy"
maxVariables <- 5
seeds <- vector(mode="list", length=repeats*number+1)
for (i in seq_along(seeds))
seeds[[i]] <- (1000*i+1):(1000*i+1+tuneLength^maxVariables)
fitControl <- trainControl(method=method, number=number, repeats=repeats,
seeds=seeds)
model <- function(method) {
message("Training model: ", method, "...", appendLF=FALSE)
set.seed(7474505)
suppressMessages({ captured <- capture.output({
fit <- train(winner ~ ., data=playoffs,
method=method, trControl=fitControl, preProcess=preProcess,
metric=metric, tuneLength=tuneLength)
})})
message()
fit
}
methods <- c("glm", "lda", "nnet", "rf", "svmLinear")
models <- lapply(methods, model)
## Training model: glm...
## Training model: lda...
## Training model: nnet...
## Training model: rf...
## Training model: svmLinear...
names(models) <- methods
saveRDS(models, "models.rds")
resamps <- resamples(models)
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: glm, lda, nnet, rf, svmLinear
## Number of resamples: 100
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glm 0.3750 0.5625 0.6250 0.6291 0.7059 0.8824 0
## lda 0.3750 0.5625 0.6360 0.6351 0.7059 0.8824 0
## nnet 0.4375 0.5542 0.6250 0.6221 0.6875 0.8824 0
## rf 0.2941 0.5625 0.6471 0.6464 0.7059 0.8824 0
## svmLinear 0.4118 0.5294 0.6250 0.6274 0.7059 0.8824 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glm -0.2698 0.06849 0.2381 0.2272 0.3761 0.7463 0
## lda -0.2698 0.06804 0.2381 0.2402 0.3796 0.7463 0
## nnet -0.1168 0.09677 0.2131 0.2406 0.3599 0.7671 0
## rf -0.3973 0.09160 0.2388 0.2533 0.3796 0.7571 0
## svmLinear -0.2687 0.06849 0.2381 0.2260 0.3761 0.7463 0
trellis.par.set(caretTheme())
bwplot(resamps, layout=c(2, 1))
dotplot(resamps, metric="Accuracy")
dotplot(resamps, metric="Kappa")
Next: 5. Make predictions
Previous: 3. Process data