library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
# Notebook display defaults: emit plots as PNG and use a 16:9 figure
# (width 8, height 4.5) unless a later cell overrides the size.
options(
  jupyter.plot_mimetypes = "image/png",
  repr.plot.width = 16 / 2,
  repr.plot.height = 9 / 2
)
Column 1: Number of passengers denied boarding involuntarily who qualified for denied boarding compensation (this item continues with sub-categories that are not reproduced here)
Column 2: Number of passengers denied boarding involuntarily who did not qualify for denied boarding compensation
Column 3: Total number denied boarding involuntarily
Column 4: Number of passengers denied boarding involuntarily who actually received compensation
Column 5: Number of passengers who volunteered to give up reserved space in exchange for a payment of the carrier's choosing
Column 6: Number of passengers accommodated in another section of the aircraft
Column 7: Total Boardings
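Column 8: Amount of compensation paid to passengers, reported in sub-columns 8(a), 8(b) and 8(c) (the passenger categories behind each sub-column are not reproduced here)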
# Folder holding the quarterly CSV files; a file's URL is this prefix
# followed by a dataset id and ".csv".
base_url <- "https://www.rita.dot.gov/bts/sites/rita.dot.gov.bts/files/"

# Dataset ids: every quarter of 2014 and 2015, then 2016 Q1 and Q4.
# The data for 2016-Q2 and 2016-Q3 aren't available in CSV format,
# and the 2016-Q4 file name carries an extra '_0' suffix.
datasets <- c(
  paste0(rep(2014:2015, each = 4), "_", 1:4, "q"),
  "2016_1q",
  "2016_4q_0"
)
# Download every quarterly file, tidy it, and stack the results row-wise.
# Each quarter ends up as: carrier name, the numeric report columns, and a
# "YYYY Qn" quarter label.
data <- do.call(rbind, lapply(
  datasets,
  function(q) {
    # Read without a header: the real header row sits below metadata lines.
    sheet <- fread(paste0(base_url, q, ".csv"), header = FALSE)
    sheet <- sheet[, 1:15]
    print(paste("Processing:", sheet[2, 1]))

    # The row whose first cell is "CARRIER" carries the column names.
    is_header <- sheet$V1 == "CARRIER"
    col_names <- as.character(sheet[is_header, ])

    # Exclude metadata/header rows: keep rows with a non-empty second cell
    # that are not the header row itself.
    sheet <- sheet[(sheet$V2 != "") & !is_header, ]
    names(sheet) <- col_names

    # First 4 characters of the id are the year, the 6th is the quarter digit.
    quarter_label <- paste(substr(q, 1, 4), paste0("Q", substr(q, 6, 6)))

    cbind(
      sheet[, 1], # Carrier name
      # Strip ',' and '$' (and a literal '|', which the character class also
      # matches) before converting each remaining column to numeric.
      sapply(sheet[, -1], function(col) as.numeric(gsub("[,|$]", "", col))),
      quarter = quarter_label
    )
  }
))
head(data)
# Keep only the airlines that show up in every dataset.
# n() counts rows per carrier, so this relies on each carrier having exactly
# one row per quarterly file.
print("Before: ")
unique(data[["CARRIER"]])
data <- data %>%
  group_by(CARRIER) %>%
  filter(n() == length(datasets)) %>%
  ungroup()
print("After: ")
unique(data[["CARRIER"]])
# Stats
# Build the analysis table: the per-quarter rows plus one extra "overall" row
# per carrier holding the column-wise sums over all quarters, followed by the
# derived counts and rates used by the plots.
reaccommodated <- data %>%
  bind_rows(
    data %>%
      group_by(CARRIER) %>%
      select(-quarter) %>%
      # summarise_all() replaces the deprecated summarise_each(funs(sum));
      # it sums every non-grouping column per carrier.
      summarise_all(sum) %>%
      mutate(quarter = "overall")
  ) %>%
  mutate(
    CARRIER = factor(CARRIER),
    quarter = factor(quarter),
    voluntary = `5`,                  # column 5: volunteered to give up seat
    involuntary = `3`,                # column 3: total denied involuntarily
    reaccommodated = `3` + `5`,       # all denied boardings
    boarded = `7`,                    # column 7: total boardings
    # Rates are expressed per 10,000 boardings.
    rate = reaccommodated / boarded * 10000,
    rate_voluntary = voluntary / boarded * 10000,
    rate_involuntary = involuntary / boarded * 10000,
    # Total compensation (columns 8(a)-8(c)) per denied-boarding passenger.
    avg_comp = (`8(a)` + `8(b)` + `8(c)`) / reaccommodated
  )
# Rank by 2016 Q4 stats
# Rank 1 is the carrier with the highest denied-boarding rate in 2016 Q4.
stats_2016q4 <- reaccommodated %>%
  filter(quarter == "2016 Q4") %>%
  mutate(rank = min_rank(-rate))
rank_2016q4 <- stats_2016q4$rank

# Order the CARRIER factor levels from lowest to highest 2016 Q4 rate.
# The positions returned by order() refer to rows of the 2016 Q4 subset, so
# they must index that subset's carriers; indexing the full table would pick
# up the first rows of the earliest quarter instead and is only correct when
# every file lists carriers in the same order.
reaccommodated$CARRIER <- factor(
  reaccommodated$CARRIER,
  levels = as.character(stats_2016q4$CARRIER)[order(-rank_2016q4)]
)
# Horizontal bar chart of the 2016 Q4 denied-boarding rate, one bar per carrier.
reaccommodated %>%
  filter(quarter == "2016 Q4") %>%
  ggplot(aes(x = CARRIER, y = rate)) +
  geom_col(width = 0.5, fill = "#f1ad46") +
  coord_flip() +
  labs(
    title = "Denied Boardings per 10,000 passengers (Q4 2016)",
    x = "",
    y = ""
  )
# Bubble chart for 2016 Q4: x is the denied-boarding rate, y the boardings in
# units of 10,000, and bubble size the number of passengers denied boarding.
reaccommodated %>%
  filter(quarter == "2016 Q4") %>%
  ggplot(aes(
    x = rate,
    y = boarded / 10000,
    size = reaccommodated,
    label = CARRIER
  )) +
  # Translucent bubble with a white outline ...
  geom_point(shape = 21, fill = "#f1ad46", colour = "white", alpha = .6, stroke = 2) +
  # ... with a small black dot marking its centre.
  geom_point(size = .3, color = "black") +
  # Carrier label, shifted up by 250 on the y scale.
  geom_text(aes(y = boarded / 10000 + 250), size = 2.8, hjust = 0.5, vjust = 1) +
  scale_size_continuous(range = c(1, 15)) +
  labs(
    title = "Number of passengers denied boarding (Q4 2016)",
    x = "Denied Boarding Rate (per 10,000 passengers)",
    y = "Boardings (10,000 passengers)",
    size = "Passengers denied boarding (passengers)"
  ) +
  theme(legend.position = "bottom")

# The same 2016 Q4 rows as a table, sorted by number of denied boardings.
reaccommodated %>%
  filter(quarter == "2016 Q4") %>%
  arrange(reaccommodated)
# Taller figure for the year-by-quarter grid of panels.
options(repr.plot.width = 8, repr.plot.height = 6)

# Denied-boarding rate per carrier, one panel per quarter
# (rows = year, columns = quarter); the "overall" rows are left out.
reaccommodated %>%
  filter(quarter != "overall") %>%
  mutate(
    # Quarter labels look like "2016 Q4": split into year and "Qn".
    year = substr(quarter, 1, 4),
    q = substr(quarter, 6, 7)
  ) %>%
  ggplot(aes(x = CARRIER, y = rate)) +
  geom_col(width = 0.5, fill = "#f1ad46") +
  facet_grid(year ~ q) +
  coord_flip() +
  labs(
    title = "Denied Boardings per 10,000 passengers (2014-2016)",
    x = "",
    y = ""
  )
# Back to the default 16:9 figure size.
options(repr.plot.width = 16 / 2, repr.plot.height = 9 / 2)

# Stacked bars of the overall denied-boarding rate per carrier, split into
# involuntary and voluntary components.
reaccommodated %>%
  filter(quarter == "overall") %>%
  select(CARRIER, voluntary = rate_voluntary, involuntary = rate_involuntary) %>%
  # Long format: one row per carrier and denial type.
  gather(key = "type", value = "rate", 2:3) %>%
  # Fix the level order so colours and labels line up with the manual scale.
  mutate(type = factor(type, levels = c("involuntary", "voluntary"))) %>%
  ggplot(aes(x = CARRIER, y = rate, fill = type)) +
  geom_col(width = 0.5) +
  coord_flip() +
  scale_fill_manual(
    values = c("#f1ad46", "#48B6A3"),
    labels = c("Involuntary ", "Voluntary")
  ) +
  labs(
    title = "Denied Boardings per 10,000 passengers (2014-2016)",
    x = "",
    y = "",
    fill = ""
  ) +
  theme(legend.position = "bottom")
# Overall involuntary denied-boarding rate per carrier, with bars sorted by
# that rate.
reaccommodated %>%
  filter(quarter == "overall") %>%
  # Re-level the carriers by ascending involuntary rate for the axis order.
  mutate(
    carrier_sorted = factor(CARRIER, levels = CARRIER[order(rate_involuntary)])
  ) %>%
  ggplot(aes(x = carrier_sorted, y = rate_involuntary)) +
  geom_col(width = 0.5, fill = "#f1ad46") +
  coord_flip() +
  labs(
    title = "Involuntary Denied Boardings per 10,000 passengers (2014-2016)",
    x = "",
    y = ""
  )
# Overall average compensation per denied-boarding passenger, one bar per
# carrier, sorted by that average.
reaccommodated %>%
  filter(quarter == "overall") %>%
  # Re-level the carriers by ascending average compensation for the axis order.
  mutate(
    carrier_sorted = factor(CARRIER, levels = CARRIER[order(avg_comp)])
  ) %>%
  ggplot(aes(x = carrier_sorted, y = avg_comp)) +
  geom_col(width = 0.5, fill = "#f1ad46") +
  coord_flip() +
  labs(
    title = "Average compensation paid to passengers denied boarding (2014-2016)",
    x = "",
    y = "USD"
  ) +
  theme(text = element_text(family = "Helvetica", size = 12))