old ipl project. Lots of suboptimal decisions date:2022-04-17 rendered: 2025-03-31
Code
# Load necessary libraries
# Using suppressPackageStartupMessages to keep output clean
suppressPackageStartupMessages({
library(ggtext)
library(lme4)
library(tidyverse)
library(plotly)
library(viridisLite) # For better color scales
})
Indian Premier League
The goal of the present project is to download, clean, wrangle, and obtain insights from ipl data.
The data will be downloaded from cricsheet.org in a zip format. We will unzip the files to a directory.
Code
temp <- tempfile()
download.file("https://cricsheet.org/downloads/ipl_csv2.zip", temp)
unzip(zipfile = temp, exdir = "ipl_data") #takes a couple of minutes
rm(temp)
The next step would be load the data to R. We can achieve that using the read_csv
function. However, while this data file is large and contains ball-by-ball details, it does not have match info i.e., who won the match and by how many wickets/runs.
We will obtain that data using the match info files. We can read only those files by adding them as a list for now.
Code
df <- read_csv("ipl_data/all_matches.csv") # read the data
files <- list.files(path = "ipl_data", pattern = "*_info.csv") #load the paths of
#metadata files ----
files <- paste0("ipl_data/", files) #append
One issue is that there are hundreds of files that match our pattern, and it is not possible to load them and use them effectively if we do it manually. One solution is to write a function that takes each of the file, appends it to a dataframe.
Code
f1 <- function(x) {
y <- read.table(files[x], #load file
sep = ",",
skip = 2,
col.names = paste0("V",seq_len(5)), fill = TRUE)
y <- y[,-1] #remove the first column 'info'
y <- y %>% group_by(V2) %>% #number the column so that it can be pivoted wo errors
mutate(id = seq_along(V2)) %>%
ungroup() %>%
mutate(V2 = paste0(.$V2, .$id)) %>%
select(1, 2)
y <- pivot_wider(y, names_from = V2, values_from = V3) %>% # lay data on side
mutate(match_id = parse_number(file.path(files[x]))) %>%
select(!starts_with(c("player","regis")))
# the info file doesn't contain the match id! Parsing it from file names
}
df_info <- f1(1) # read the (meta)data file one
df_info <- df_info[FALSE, ] #keep only the headers
for (i in 1:length(files)) {
X <- f1(i)
df_info <- bind_rows(df_info, X)
}
df_info <- df_info %>%
rename_with(~str_remove(., "1")) %>%
select(match_id, match_number, everything()) %>%
select(!c(event, gender, venue)) %>%
arrange(match_id)
Let’s take a look at the data frames we have so far.
Code
final_df <- inner_join(df, df_info, by = 'match_id') # ugly but works..
#Clean up seasons
final_df <- final_df %>%
mutate(season.x = case_when(
season.y == "2007/08" ~ '2008',
season.y == "2009/10" ~ '2010',
season.y == "2020/21" ~ '2020',
TRUE ~ season.y),
season.x = as.numeric(as.character(season.x)))
#write.csv(final_df, "final_df.csv")
nanoparquet::write_parquet(final_df, "cricket/final_df.parquet") # 2025 change: saved 100x disk space
Load the final cleaned up data
Code
#final_df <- read.csv("final_df.csv")
final_df <- nanoparquet::read_parquet("cricket/final_df.parquet")
List of all teams that have played IPL so far
Code
(team_list <- final_df %>%
distinct(team))
# A data frame: 19 × 1
team
<chr>
1 Royal Challengers Bangalore
2 Kings XI Punjab
3 Delhi Daredevils
4 Kolkata Knight Riders
5 Mumbai Indians
6 Rajasthan Royals
7 Deccan Chargers
8 Chennai Super Kings
9 Kochi Tuskers Kerala
10 Pune Warriors
11 Sunrisers Hyderabad
12 Gujarat Lions
13 Rising Pune Supergiants
14 Rising Pune Supergiant
15 Delhi Capitals
16 Punjab Kings
17 Lucknow Super Giants
18 Gujarat Titans
19 Royal Challengers Bengaluru
Exploring the Data
Lets start by making a function that gives some basic details about any given team
Code
team_stat <- function(team_name) {
f1 <- final_df %>% #_team_select
group_by(season.x) %>%
arrange(desc(match_id)) %>%
filter(grepl(team_name, team, ignore.case = T) |
grepl(team_name, team2, ignore.case = T))
f2 <- distinct(.data = f1, match_id) %>% count() #match count
f3 <- filter(f1, grepl(team_name, winner, ignore.case = T)) %>%#winning matches
distinct(match_id) %>% count()
f4 <- filter(f1, grepl(team_name, winner, ignore.case = T))
df <- inner_join(f2, f3, by = "season.x") %>%
rename(matches = n.x, wins = n.y) %>%
mutate(win_rate = round(wins/matches * 100))
df$team <- f4[1,36] # add a new column with the team name
df <- df %>% unnest(last_col()) %>%
rename(team = last_col()) %>% #rename the column
ungroup() #ungroup the df
return(df)
}
This particular function takes regex input
Code
# A tibble: 12 × 5
season.x matches wins win_rate team
<dbl> <int> <int> <dbl> <chr>
1 2013 17 9 53 Sunrisers Hyderabad
2 2014 14 6 43 Sunrisers Hyderabad
3 2015 14 7 50 Sunrisers Hyderabad
4 2016 17 11 65 Sunrisers Hyderabad
5 2017 14 8 57 Sunrisers Hyderabad
6 2018 17 10 59 Sunrisers Hyderabad
7 2019 15 6 40 Sunrisers Hyderabad
8 2020 16 8 50 Sunrisers Hyderabad
9 2021 14 3 21 Sunrisers Hyderabad
10 2022 14 6 43 Sunrisers Hyderabad
11 2023 14 4 29 Sunrisers Hyderabad
12 2024 16 9 56 Sunrisers Hyderabad
Code
# A tibble: 17 × 5
season.x matches wins win_rate team
<dbl> <int> <int> <dbl> <chr>
1 2008 14 7 50 Delhi Capitals
2 2009 15 9 60 Delhi Capitals
3 2010 14 7 50 Delhi Capitals
4 2011 14 4 29 Delhi Capitals
5 2012 18 11 61 Delhi Capitals
6 2013 16 3 19 Delhi Capitals
7 2014 14 2 14 Delhi Capitals
8 2015 14 5 36 Delhi Capitals
9 2016 14 7 50 Delhi Capitals
10 2017 14 6 43 Delhi Capitals
11 2018 14 5 36 Delhi Capitals
12 2019 16 9 56 Delhi Capitals
13 2020 17 8 47 Delhi Capitals
14 2021 16 9 56 Delhi Capitals
15 2022 14 7 50 Delhi Capitals
16 2023 14 5 36 Delhi Capitals
17 2024 14 7 50 Delhi Capitals
Code
# A tibble: 4 × 5
season.x matches wins win_rate team
<dbl> <int> <int> <dbl> <chr>
1 2016 14 5 36 Lucknow Super Giants
2 2022 15 9 60 Lucknow Super Giants
3 2023 15 8 53 Lucknow Super Giants
4 2024 14 7 50 Lucknow Super Giants
Code
# A tibble: 1 × 5
season.x matches wins win_rate team
<dbl> <int> <int> <dbl> <chr>
1 2017 16 10 62 Rising Pune Supergiant
What about boundaries (4s and 6s)
Code
boundaries <- final_df %>%
group_by(match_id, runs_off_bat) %>%
count(runs_off_bat) %>% filter(runs_off_bat == 4 | runs_off_bat == 6 ) %>%
ungroup() %>%
mutate(match_no = row_number()) %>%
select(match_no, everything())
p5 <- boundaries %>%
inner_join(.,first_innings_run, by = 'match_id') %>%
ggplot(aes(x = match_no.y, y = n, color = as.factor(runs_off_bat.x))) +
geom_point()+
geom_smooth()+
labs(
x = "Matches",
y = "Number of boundaries per match",
title = "Boundaries and IPL matches",
col = "4/6"
)
ggplotly(p5)
Stats
A simple linear model for testing for increase in runs over time
Code
lm(total_runs ~ match_no, first_innings_run) %>% summary()
Call:
lm(formula = total_runs ~ match_no, data = first_innings_run)
Residuals:
Min 1Q Median 3Q Max
-108.533 -19.091 -0.118 21.021 108.646
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 151.67752 1.87866 80.737 <2e-16 ***
match_no 0.02531 0.00297 8.523 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 31.06 on 1093 degrees of freedom
Multiple R-squared: 0.06232, Adjusted R-squared: 0.06146
F-statistic: 72.64 on 1 and 1093 DF, p-value: < 2.2e-16