Previous: 3. Process data
Next: 5. Make predictions

4. Train models

Load packages and the Canadian programming library. Please note that caret automatically loads plyr. As I’m also using dplyr, it’s crucial that dplyr is loaded after plyr. Doing it the other way around causes problems. Also define two CPU cores to be used for model training.

suppressMessages({
  library(plyr)
  library(dplyr)
  library(caret)
  library(doMC)
})
source("canadian.R")
registerDoMC(cores=2)

Function to detect home/away team and winner.

team <- function(x, tied.eh=c("silent", "warn", "stop")) {
  counts <- sort(table(x), decreasing=TRUE)
  if ((length(counts) == 1) || (counts[1] > counts[2]))
    return(names(counts)[1])
  tied.eh <- match.arg(tied.eh)
  if (tied.eh == "error")
    stop("I'm sorry, but I couldn't figure out the winner: ",
      paste(x, collapse=", "))
  if (tied.eh == "warn")
    warning("I couldn't figure out the winner: ", paste(x, collapse=", "))
  x[1]
}

Function to add game statistis.

add_stats <- function(games, gamestats, which=c("both", "single", "overall")) {
  which <- match.arg(which)
  if (which == "overall") {
    away <- left_join(games, gamestats[["overall"]],
      by=c("season", awayteam="team"))
    home <- left_join(games, gamestats[["overall"]],
      by=c("season", hometeam="team"))
  } else {
    away <- left_join(games, gamestats[["away"]],
      by=c("season", awayteam="team"))
    home <- left_join(games, gamestats[["home"]],
      by=c("season", hometeam="team"))
  }

  if (which == "both") {
    away2 <- left_join(games, gamestats[["home"]],
      by=c("season", awayteam="team"))
    home2 <- left_join(games, gamestats[["away"]],
      by=c("season", hometeam="team"))
  }

  games$goals <- away$goals - home$goals
  games$shots <- away$shots - home$shots
  games$faceoffs <- away$faceoffs - home$faceoffs
  games$penalties <- away$penalties - home$penalties
  games$pp <- away$pp - home$pk
  games$pk <- away$pk - home$pp

  if (which == "both") {
    games$goals2 <- away2$goals - home2$goals
    games$shots2 <- away2$shots - home2$shots
    games$faceoffs2 <- away2$faceoffs - home2$faceoffs
    games$penalties2 <- away2$penalties - home2$penalties
    games$pp2 <- away2$pp - home2$pk
    games$pk2 <- away2$pk - home2$pp
  }

  games
}

Load and process.

load(file.path("source-data", "nhlscrapr-core.RData"))
rm(list=c("roster.master", "roster.unique"))

gamestats <- readRDS("processed.rds")

games <- tbl_df(games)
games <- games %>%
  filter(status != 0, session == "Playoffs", season != "20142015") %>%
  mutate(awayscore=as.integer(awayscore), homescore=as.integer(homescore))

playoffs <- games %>%
  mutate(winner=ifelse(awayscore > homescore, awayteam, hometeam)) %>%
  group_by(season, round=substring(gcode, 3, 3),
    series=substring(gcode, 4, 4)) %>%
  summarise(awayteam=team(awayteam), hometeam=team(hometeam),
    winner=team(winner, tied.eh="stop")) %>%
  select(season, round, series, awayteam, hometeam, winner) %>%
  mutate(winner=ifelse(awayteam == winner, "away", "home"))

playoffs <- add_stats(playoffs, gamestats)

playoffs <- playoffs %>%
  ungroup() %>%
  mutate(season=as.integer(substring(season, 5, 8))) %>%
  select(-season, -round, -series, -awayteam, -hometeam)

playoffs$winner = as.factor(playoffs$winner)
# settings
method <- "repeatedcv"
number <- 10
repeats <- 10
preProcess <- c("center", "scale")
tuneLength <- 5
metric <- "Accuracy"
maxVariables <- 5
seeds <- vector(mode="list", length=repeats*number+1)
for (i in seq_along(seeds))
  seeds[[i]] <- (1000*i+1):(1000*i+1+tuneLength^maxVariables)

fitControl <- trainControl(method=method, number=number, repeats=repeats,
  seeds=seeds)

model <- function(method) {
  message("Training model: ", method, "...", appendLF=FALSE)
  set.seed(7474505)
  suppressMessages({ captured <- capture.output({
    fit <- train(winner ~ ., data=playoffs,
      method=method, trControl=fitControl, preProcess=preProcess,
      metric=metric, tuneLength=tuneLength)
  })})
  message()
  fit
}

methods <- c("glm", "lda", "nnet", "rf", "svmLinear")
models <- lapply(methods, model)
## Training model: glm...
## Training model: lda...
## Training model: nnet...
## Training model: rf...
## Training model: svmLinear...
names(models) <- methods

saveRDS(models, "models.rds")

resamps <- resamples(models)
summary(resamps)
## 
## Call:
## summary.resamples(object = resamps)
## 
## Models: glm, lda, nnet, rf, svmLinear 
## Number of resamples: 100 
## 
## Accuracy 
##             Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## glm       0.3750  0.5625 0.6250 0.6291  0.7059 0.8824    0
## lda       0.3750  0.5625 0.6360 0.6351  0.7059 0.8824    0
## nnet      0.4375  0.5542 0.6250 0.6221  0.6875 0.8824    0
## rf        0.2941  0.5625 0.6471 0.6464  0.7059 0.8824    0
## svmLinear 0.4118  0.5294 0.6250 0.6274  0.7059 0.8824    0
## 
## Kappa 
##              Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## glm       -0.2698 0.06849 0.2381 0.2272  0.3761 0.7463    0
## lda       -0.2698 0.06804 0.2381 0.2402  0.3796 0.7463    0
## nnet      -0.1168 0.09677 0.2131 0.2406  0.3599 0.7671    0
## rf        -0.3973 0.09160 0.2388 0.2533  0.3796 0.7571    0
## svmLinear -0.2687 0.06849 0.2381 0.2260  0.3761 0.7463    0
trellis.par.set(caretTheme())
bwplot(resamps, layout=c(2, 1))

dotplot(resamps, metric="Accuracy")

dotplot(resamps, metric="Kappa")

Next: 5. Make predictions
Previous: 3. Process data