Upping your storytelling game

…with help from tools that other already people made.

Mara Averick
2019-03-29

library(tidyverse)
library(glue)

library(gt)

library(hrbrthemes)
library(ggpomological)

library(highcharter)

Grab the the most recent advanced metrics from basketball reference using the {nbastatR} package by Alex Bresler. Note, running bref_players_stats() will assign the output data frames, dataBREFPlayerTotals and dataBREFPlayerAdvanced, to the environment, so we don’t need to do anything else (I rename them for my own sanity).


library(nbastatR)
bref_players_stats(seasons = 2019, tables = c("advanced", "totals"), 
                   widen = TRUE, assign_to_environment = TRUE)
bref_advanced <- dataBREFPlayerAdvanced
bref_totals <- dataBREFPlayerTotals

I always like to start out by skimming with the skimr package…


skimr::skim(bref_advanced)
#> Skim summary statistics
#>  n obs: 501 
#>  n variables: 36 
#> 
#> ── Variable type:character ──────────────────────────────────────────────────────────────────
#>            variable missing complete   n min max empty n_unique
#>            idPlayer       0      501 501   6   9     0      501
#>      idPlayerSeason       0      501 501  11  14     0      501
#>          idPosition       0      501 501   1   5     0       13
#>          namePlayer       0      501 501   4  24     0      501
#>      namePlayerBREF       0      501 501   7  24     0      501
#>          slugSeason       0      501 501   7   7     0        1
#>        slugTeamBREF       0      501 501   3   3     0       31
#>   urlPlayerHeadshot      15      486 501  86  89     0      486
#>  urlPlayerThumbnail       0      501 501  52  89     0      501
#> 
#> ── Variable type:logical ────────────────────────────────────────────────────────────────────
#>     variable missing complete   n mean           count
#>  isHOFPlayer       0      501 501    0 FAL: 501, NA: 0
#> 
#> ── Variable type:numeric ────────────────────────────────────────────────────────────────────
#>         variable missing complete   n       mean         sd      p0
#>        agePlayer       0      501 501     26.08       4.18    19   
#>       countGames       0      501 501     44.04      22.57     1   
#>      idPlayerNBA       0      501 501 880176.29  724825.17  1114   
#>          minutes       0      501 501   1005.99     745.5      1   
#>        pct3PRate       5      496 501      0.36       0.21     0   
#>           pctAST       0      501 501      0.13       0.094    0   
#>           pctBLK       0      501 501      0.17       0.28     0   
#>           pctDRB       0      501 501      0.15       0.084    0   
#>        pctFTRate       5      496 501      0.24       0.14     0   
#>           pctORB       0      501 501      0.073      0.14     0   
#>           pctSTL       0      501 501      0.12       0.26     0   
#>           pctTOV       4      497 501      0.12       0.055    0   
#>           pctTRB       0      501 501      0.1        0.059    0   
#>  pctTrueShooting       4      497 501      0.53       0.11     0   
#>           pctUSG       0      501 501      0.19       0.059    0   
#>         ratioBPM       0      501 501     -1.95       5.61   -52.7 
#>        ratioDBPM       0      501 501     -0.55       2.68   -20.5 
#>         ratioDWS       0      501 501      1.02       0.98    -0.4 
#>        ratioOBPM       0      501 501     -1.4        4.3    -36.1 
#>         ratioOWS       0      501 501      1.1        1.66    -2.7 
#>         ratioPER       0      501 501     12.99       7.62   -38   
#>        ratioVORP       0      501 501      0.51       1.18    -1.9 
#>          ratioWS       0      501 501      2.12       2.44    -1.6 
#>     ratioWSPer48       0      501 501      0.076      0.12    -0.95
#>       yearSeason       0      501 501   2018          0     2018   
#>  yearSeasonFirst      15      486 501   2013.49       4.36  1989   
#>        p25       p50         p75       p100     hist
#>     23        25          29          42    ▃▇▇▆▃▁▁▁
#>     26        50          64          72    ▃▂▂▂▂▃▅▇
#>  2e+05     2e+05     1628383     1629541    ▇▁▁▁▁▁▁▇
#>    292       945        1672        2671    ▇▃▃▃▃▃▃▁
#>      0.23      0.38        0.52        0.9  ▇▃▇▇▇▅▂▁
#>      0.068     0.11        0.18        0.59 ▆▇▃▂▁▁▁▁
#>      0.012     0.024       0.2         0.9  ▇▁▁▁▁▁▁▁
#>      0.1       0.14        0.19        0.92 ▅▇▂▁▁▁▁▁
#>      0.15      0.23        0.32        0.86 ▃▇▇▃▂▁▁▁
#>      0.02      0.034       0.074       1    ▇▁▁▁▁▁▁▁
#>      0.013     0.016       0.024       0.9  ▇▁▁▁▁▁▁▁
#>      0.095     0.12        0.15        0.5  ▂▇▆▁▁▁▁▁
#>      0.062     0.087       0.13        0.52 ▅▇▃▁▁▁▁▁
#>      0.5       0.55        0.58        0.83 ▁▁▁▁▃▇▁▁
#>      0.15      0.18        0.22        0.47 ▁▁▇▆▂▁▁▁
#>     -3.9      -1.4         0.6        27.2  ▁▁▁▁▃▇▁▁
#>     -1.7      -0.5         0.9         8.6  ▁▁▁▁▂▇▂▁
#>      0.2       0.8         1.5         5    ▇▇▆▃▂▁▁▁
#>     -2.8      -1.1         0.3        22.3  ▁▁▁▁▇▂▁▁
#>      0         0.5         1.7         9.1  ▁▇▆▂▂▁▁▁
#>      9.4      12.6        16.5        80.8  ▁▁▁▇▁▁▁▁
#>     -0.1       0.1         0.8         8.1  ▁▇▂▁▁▁▁▁
#>      0.2       1.3         3.1        13    ▆▇▅▂▁▁▁▁
#>      0.039     0.081       0.13        1.26 ▁▁▁▇▂▁▁▁
#>   2018      2018        2018        2018    ▁▁▁▇▁▁▁▁
#>   2011      2015        2017        2018    ▁▁▁▁▁▂▃▇

Now we can filter and munge as needed:


adv_player_stats <- bref_advanced %>%
  filter(minutes >= 500) %>%
  mutate(bref_url = glue::glue("https://www.basketball-reference.com/players/{stringr::str_sub(idPlayer, 1, 1)}/{idPlayer}.html"),
         bref_link = glue::glue('<a href="{bref_url}">{namePlayer}</a>'))

Collapse positions into front and backcourt:


unique_positions <- unique(bref_advanced$idPosition)
frontcourt <- c("PF", "SF", "C", "PF-SF", "C-PF", "SG-PF", "SF-PF")
backcourt <- c("PG", "SG", "PG-SG", "SG-PG", "SF-SG", "SG-SF")

bref_efg <- bref_totals %>%
  select(one_of(c("idPlayer", "pctEFG")))

adv_player_stats <- adv_player_stats %>%
  left_join(bref_efg, by = "idPlayer") %>%
  mutate( "position" = case_when(
    idPosition %in% frontcourt ~ "frontcourt",
    idPosition %in% backcourt ~ "backcourt",
    TRUE ~ "other"),
    "position" = as.factor(position)
  )

Let’s also get some info from the NBA Stats API using teams_players_states(). By using assign_to_environment = TRUE, we’ll automatically get a data frame dataGeneralPlayers. For now I just want players’ offensive rating1, ortg, and defensive rating2, drtg.


nbastatR::teams_players_stats(seasons = 2019, types = c("player"), 
                              tables = "general", measures = "Advanced",
                              assign_to_environment = TRUE)

player_rtgs <- dataGeneralPlayers %>%
  select(one_of(c("idPlayer", "ortg", "drtg")))

adv_player_stats <- adv_player_stats %>%
  left_join(player_rtgs, by = c("idPlayerNBA" = "idPlayer"))

adv_player_stats %>%
  ggplot(aes(x = ratioPER)) +
  geom_histogram()

Let’s get some help from glue and hrbrthemes


adv_player_stats %>%
  ggplot(aes(x = ratioPER)) +
  geom_histogram(alpha = 0.7, fill = "#011627") +
  labs(title = "PER for players with 500+ minutes",
       subtitle = "NBA 2018-2019 season",
       caption = glue::glue("data via nbastatR {yesterday}")) +
  hrbrthemes::theme_ipsum_rc()


adv_player_stats %>%
  ggplot(aes(x = ratioVORP)) +
  geom_histogram(alpha = 0.7, fill = "#011627") +
  labs(title = "Value Over Replacement Player (VORP)",
       subtitle = "NBA 2018-2019 season, players with 500+ minutes",
       caption = glue::glue("data via nbastatR {yesterday}")) +
  hrbrthemes::theme_ipsum_rc()


adv_player_stats %>%
  ggplot(aes(x = ratioWS)) +
  geom_histogram(alpha = 0.7, fill = "#011627") +
  labs(title = "Win Shares for players with 500+ minutes",
       subtitle = "NBA 2018-2019 season",
       caption = glue::glue("data via nbastatR {yesterday}")) +
  hrbrthemes::theme_ipsum_rc()

Histograms are all well and good, but let’s look at something a little more interesting…


adv_player_stats %>%
  ggplot(aes(x = ratioOBPM, y = ratioDBPM)) +
  geom_point() +
  geom_hline(yintercept = 0, alpha = 0.6, lty = "dashed") +
  geom_vline(xintercept = 0, alpha = 0.6, lty = "dashed") +
  labs(title = "Offensive vs. Defensive Box Plus-Minus",
       subtitle = glue::glue("NBA 2018-2019 season through {yesterday}"),
       caption = glue::glue("data via nbastatR"),
       x = "OBPM",
       y = "DBPM") +
  hrbrthemes::theme_ipsum_rc()

Things are a pretty boring without annotation — and we’re not doing much in the way of storytelling. Luckily Hiroaki Yutani’s gghighlight package can help us out with that!

Because gghighlight uses a predicate function to determine what to highlight, I’ll make a little helper fun to get the top 10 players for some variable.


get_top10 <- function(df, column) {
  require(rlang)
  column <- enquo(column)
  dplyr::top_n(df, n = 10, wt = !!column) %>%
    pull(namePlayer)
}

Things are looking a little more complex, so let’s look at the pieces of code in this next section.


# get top 10 for desired variable (in this case ratioBPM)
top10_BPM <- top_n(adv_player_stats, n = 10, wt = ratioBPM) %>%
  pull(namePlayer)

adv_player_stats %>%
  ggplot(aes(x = ratioOBPM, y = ratioDBPM)) +
  geom_point(color = "#011627") +
  gghighlight::gghighlight(namePlayer %in% top10_BPM, label_key = namePlayer,
                           label_params = list(fill = ggplot2::alpha("white", 0.8),
                                                 box.padding = 0,
                                                 family = "Roboto Condensed"),
                           unhighlighted_colour = "#007190") +
  geom_hline(yintercept = 0, alpha = 0.6, lty = "dashed") +
  geom_vline(xintercept = 0, alpha = 0.6, lty = "dashed") +
  labs(title = "Offensive vs. Defensive Box Plus-Minus: Top 10 Box Plus/Minus",
       subtitle = glue::glue("NBA 2018-2019 season through {yesterday}"),
       caption = glue::glue("data via nbastatR"),
       x = "OBPM",
       y = "DBPM") +
  hrbrthemes::theme_ipsum_rc()

Predicate functions won’t always hit everything you want to see, which is why interactive visualizations can be a great tool for exploration. There are also some widgets and add-ins in RStudio that can help out with this.3

Since we’re using distill for R Markdown, we have some nice options in terms of figure layout. Below, I’ll use layout="l-body-outset" as a chunk parameter.


top10_WS <- get_top10(adv_player_stats, ratioWSPer48)

adv_player_stats %>%
  ggplot(aes(x = ratioOWS, y = ratioDWS)) +
  geom_point(color = "#011627") +
  gghighlight::gghighlight(namePlayer %in% top10_WS, label_key = namePlayer,
                           label_params = list(fill = ggplot2::alpha("white", 0.8),
                                                 box.padding = 0,
                                                 family = "Roboto Condensed"),
                           unhighlighted_colour = "#007190") +
  geom_hline(yintercept = 0, alpha = 0.6, lty = "dashed") +
  geom_vline(xintercept = 0, alpha = 0.6, lty = "dashed") +
  labs(title = "Offensive vs. Defensive Win Shares: Top 10 WS Per 48",
       subtitle = glue::glue("NBA 2018-2019 season through {yesterday}"),
       caption = glue::glue("data via nbastatR"),
       x = "OWS",
       y = "DWS") +
  hrbrthemes::theme_ipsum_rc()

We can go even wider by using layout="l-page".


top10_EFG <- get_top10(adv_player_stats, pctEFG)

adv_player_stats %>%
  ggplot(aes(x = (pctTrueShooting - mean(pctTrueShooting)), y = (ratioPER - mean(ratioPER)))) +
  geom_point(color = "#011627") +
  gghighlight::gghighlight(namePlayer %in% top10_EFG, label_key = namePlayer,
                           label_params = list(fill = ggplot2::alpha("white", 0.8),
                                                 box.padding = 0,
                                                 family = "Roboto Condensed"),
                           unhighlighted_colour = "#007190") +
  geom_hline(yintercept = 0, alpha = 0.6, lty = "dashed") +
  geom_vline(xintercept = 0, alpha = 0.6, lty = "dashed") +
  labs(title = "TS% above avg vs. PER above avg: Top 10 EFG%",
       subtitle = glue::glue("NBA 2018-2019 season"),
       caption = glue::glue("data via nbastatR, {yesterday}"),
       x = "true shooting %",
       y = "player efficiency rating") +
  hrbrthemes::theme_ipsum_rc()

Piping through the grammar of tables…

One of my latest favorite packages to play with is Rich Iannone’s {gt}:


adv_player_stats %>%
  select(namePlayer, ratioBPM, ratioOBPM, ratioDBPM, bref_url, urlPlayerThumbnail) %>%
  arrange(desc(ratioBPM)) %>%
  top_n(n = 10, wt = ratioBPM) %>%
  gt::gt(rowname_col = "namePlayer") %>%
  tab_header(
    title = md("**Top 10 Box Plus/Minus**")
  ) %>%
  cols_label(
    ratioBPM = md("**BPM**"),
    ratioOBPM = md("**OBPM**"),
    ratioDBPM = md("**DBPM**"),
    bref_url = md("**Link**"),
    urlPlayerThumbnail = md("")
  ) %>%
  text_transform(
    locations = cells_data(vars(bref_url)),
    fn = function(x) {
      sprintf("<a href=%s>profile</a>", x)
    }
  ) %>%
    text_transform(
    locations = cells_data(vars(urlPlayerThumbnail)),
    fn = function(x) {
      web_image(url = x) 
    }
  ) %>%
  tab_source_note(
    md("source: [basketball-reference.com](https://www.basketball-reference.com) via [nbastatR](http://asbcllc.com/nbastatR/index.html)")
  ) %>%
  tab_footnote(
    footnote = ("Players with 500+ minutes."),
    locations = cells_title("title")
    ) %>%
  tab_footnote(
    footnote = ("Box Plus/Minus: a box score estimate of the points per 100 possessions that a player contributed above a league-average player, translated to an average team."),
    locations = cells_column_labels(
      columns = vars(ratioBPM)
    )
  ) %>%
  tab_footnote(
    footnote = ("Offensive Box Plus/Minus."),
    locations = cells_column_labels(
      columns = vars(ratioOBPM)
    )
  ) %>%
  tab_footnote(
    footnote = ("Defensive Box Plus/Minus."),
    locations = cells_column_labels(
      columns = vars(ratioDBPM)
    )
  ) %>%
  tab_options(footnote.glyph = c("*, †, ‡, §, ¶, ‖"),
              table.width = px(640))
Top 10 Box Plus/Minus*
BPM OBPM DBPM § Link
James Harden 11.0 10.1 0.8 profile
Giannis Antetokounmpo 10.8 5.7 5.1 profile
Nikola Jokic 9.8 6.2 3.6 profile
Anthony Davis 8.7 4.8 3.9 profile
LeBron James 7.9 6.2 1.7 profile
Karl-Anthony Towns 7.3 5.4 1.9 profile
Kyrie Irving 6.8 6.4 0.5 profile
Nikola Vucevic 6.8 3.2 3.6 profile
Rudy Gobert 6.8 1.7 5.1 profile
Mitchell Robinson 6.2 0.4 5.8 profile
source: basketball-reference.com via nbastatR
* Players with 500+ minutes.
Box Plus/Minus: a box score estimate of the points per 100 possessions that a player contributed above a league-average player, translated to an average team.
Offensive Box Plus/Minus.
§ Defensive Box Plus/Minus.



adv_player_stats %>%
  select(urlPlayerHeadshot, namePlayer, ratioBPM, ratioOBPM, ratioDBPM) %>%
  arrange(desc(ratioOBPM)) %>%
  top_n(n = 10, wt = ratioOBPM) %>%
  gt::gt() %>%
  tab_header(
    title = md("**Top 10 Offensive Box Plus/Minus**")
  ) %>%
  cols_label(
    namePlayer = md("**Player**"),
    urlPlayerHeadshot = md(""),
    ratioBPM = md("**BPM**"),
    ratioOBPM = md("**OBPM**"),
    ratioDBPM = md("**DBPM**")
  ) %>%
    text_transform(
    locations = cells_data(vars(urlPlayerHeadshot)),
    fn = function(x) {
      web_image(url = x) 
    }
  ) %>%
  tab_source_note(
    md("source: [basketball-reference.com](https://www.basketball-reference.com) via [nbastatR](http://asbcllc.com/nbastatR/index.html)")
  ) %>%
  tab_footnote(
    footnote = ("Players with 500+ minutes."),
    locations = cells_title("title")
  ) %>%
  tab_footnote(
    footnote = ("Box Plus/Minus; a box score estimate of the points per 100 possessions that a player contributed above a league-average player, translated to an average team."),
    locations = cells_column_labels(
      columns = vars(ratioBPM)
    )
  ) %>%
  tab_footnote(
    footnote = ("Offensive Box Plus/Minus."),
    locations = cells_column_labels(
      columns = vars(ratioOBPM)
    )
  ) %>%
  tab_footnote(
    footnote = ("Defensive Box Plus/Minus."),
    locations = cells_column_labels(
      columns = vars(ratioDBPM)
    )
  ) %>%
  tab_options(footnote.glyph = c("*, †, ‡, §, ¶, ‖"),
              table.width = px(640))
Top 10 Offensive Box Plus/Minus*
Player BPM OBPM DBPM §
James Harden 11.0 10.1 0.8
Stephen Curry 6.0 7.7 -1.6
Damian Lillard 5.5 6.6 -1.1
Kyrie Irving 6.8 6.4 0.5
LeBron James 7.9 6.2 1.7
Nikola Jokic 9.8 6.2 3.6
Giannis Antetokounmpo 10.8 5.7 5.1
Karl-Anthony Towns 7.3 5.4 1.9
Paul George 5.8 5.0 0.9
Anthony Davis 8.7 4.8 3.9
source: basketball-reference.com via nbastatR
* Players with 500+ minutes.
Box Plus/Minus; a box score estimate of the points per 100 possessions that a player contributed above a league-average player, translated to an average team.
Offensive Box Plus/Minus.
§ Defensive Box Plus/Minus.


adv_player_stats %>%
  select(namePlayer, ratioBPM, ratioOBPM, ratioDBPM) %>%
  arrange(desc(ratioDBPM)) %>%
  top_n(n = 10, wt = ratioDBPM) %>%
  gt::gt() %>%
  tab_header(
    title = md("**Top 10 Defensive Box Plus/Minus**")
  ) %>%
  cols_label(
    namePlayer = md("**Player**"),
    ratioBPM = md("**BPM**"),
    ratioOBPM = md("**OBPM**"),
    ratioDBPM = md("**DBPM**")
  ) %>%
  tab_source_note(
    md("source: [basketball-reference.com](https://www.basketball-reference.com) via [nbastatR](http://asbcllc.com/nbastatR/index.html)")
  ) %>%
  tab_footnote(
    footnote = ("Players with 500+ minutes."),
    locations = cells_title("title")
  ) %>%
  tab_footnote(
    footnote = ("Box Plus/Minus; a box score estimate of the points per 100 possessions that a player contributed above a league-average player, translated to an average team."),
    locations = cells_column_labels(
      columns = vars(ratioBPM)
    )
  ) %>%
  tab_footnote(
    footnote = ("Offensive Box Plus/Minus."),
    locations = cells_column_labels(
      columns = vars(ratioOBPM)
    )
  ) %>%
  tab_footnote(
    footnote = ("Defensive Box Plus/Minus."),
    locations = cells_column_labels(
      columns = vars(ratioDBPM)
    )
  ) %>%
  tab_options(footnote.glyph = c("*, †, ‡, §, ¶, ‖"),
              table.width = px(640))
Top 10 Defensive Box Plus/Minus*
Player BPM OBPM DBPM §
Mitchell Robinson 6.2 0.4 5.8
Nerlens Noel 3.8 -1.8 5.5
Giannis Antetokounmpo 10.8 5.7 5.1
Rudy Gobert 6.8 1.7 5.1
Myles Turner 3.2 -1.6 4.8
Russell Westbrook 6.1 2.1 4.1
Anthony Davis 8.7 4.8 3.9
Joakim Noah 1.9 -1.8 3.7
Mason Plumlee 4.0 0.3 3.7
Jusuf Nurkic 5.2 1.6 3.6
Nikola Jokic 9.8 6.2 3.6
Nikola Vucevic 6.8 3.2 3.6
source: basketball-reference.com via nbastatR
* Players with 500+ minutes.
Box Plus/Minus; a box score estimate of the points per 100 possessions that a player contributed above a league-average player, translated to an average team.
Offensive Box Plus/Minus.
§ Defensive Box Plus/Minus.

Highcharts

Messing around with highcharts courtesy of Joshua Kunst’s {highcharter} package.


library(highcharter)

hchart(adv_player_stats, "scatter", hcaes(x = "ratioOBPM", y = "ratioDBPM", group = "position", name = "namePlayer", OBPM = "ratioOBPM", DBPM = "ratioDBPM", position = "position")) %>%
  hc_tooltip(pointFormat = "<b>{point.name}</b><br />OBPM: {point.OBPM}<br />DBPM: {point.DBPM}") %>%
  hc_title(text = "Offensive vs. Defensive Box Plus/Minus") %>%
  hc_subtitle(text = "NBA 2018-2019 Season") %>%
  hc_credits(enabled = TRUE,
             text = "data via nbastatR",
             style = list(
               fontSize = "10px"
               )
             ) %>%
  hc_add_theme(hc_theme_538())

hchart(adv_player_stats, "scatter", hcaes(x = "ratioOWS", y = "ratioDWS", group = "position", name = "namePlayer", OWS = "ratioOWS", DWS = "ratioDWS", position = "position")) %>%
  hc_tooltip(pointFormat = "<b>{point.name}</b><br />OWS: {point.OWS}<br />DWS: {point.DWS}") %>%
  hc_title(text = "Offensive vs. Defensive Win Shares") %>%
  hc_subtitle(text = "NBA 2018-2019 Season") %>%
  hc_credits(enabled = TRUE,
             text = "data via nbastatR",
             style = list(
               fontSize = "10px"
               )
             ) %>%
  hc_add_theme(hc_theme_economist())

hchart(adv_player_stats, "scatter", 
       hcaes(x = "pctTrueShooting", y = "ratioPER",
             name = "namePlayer", TS = "pctTrueShooting", 
             PER = "ratioPER", position = "position")) %>%
  hc_tooltip(pointFormat = "<b>{point.name}</b><br />TS%: {point.TS}<br />PER: {point.PER}<br />Position: {point.position}") %>%
  hc_title(text = "True Shooting % vs Player Efficiency Rating") %>%
  hc_subtitle(text = "NBA 2018-2019 Season") %>%
  hc_credits(enabled = TRUE,
             text = "data via nbastatR",
             style = list(
               fontSize = "14px"
               )
             ) %>%
  hc_add_theme(hc_theme_chalk(
    plotOptions = list(
      scatter = list(
        marker = list(radius = 4,
                      fillOpacity = 0.3) # actually this does nothing
        )
      )
    )
  )