Skip to content

Instantly share code, notes, and snippets.

@chrishanretty
Created December 1, 2025 20:39
Show Gist options
  • Select an option

  • Save chrishanretty/d9f508c1b7e91e62a8e6e4d2421222c8 to your computer and use it in GitHub Desktop.

Select an option

Save chrishanretty/d9f508c1b7e91e62a8e6e4d2421222c8 to your computer and use it in GitHub Desktop.
Calculate actors' h-index
### All data from https://developer.imdb.com/non-commercial-datasets/
library(readr)
library(tidyverse)
### Cast credits: keep only acting credits, then drop `category`, which is
### needed solely for that filter.
princ <- read_tsv(
  "title.principals.tsv.gz",
  col_select = c("tconst", "nconst", "category")
) |>
  filter(category %in% c("actor", "actress")) |>
  dplyr::select(-category)

### Now get the ratings of the titles
ratings <- read_tsv("title.ratings.tsv.gz")
### The join key is spelled out (as in the titles join below) so the join
### neither prints a "Joining with `by = ...`" message nor silently picks up
### any other column the two tables might share.
### NOTE(review): assumes `tconst` is the only shared column -- confirm.
princ <- left_join(princ, ratings, by = join_by(tconst))

### Now get the runtimes
### IMDb writes missing fields as a literal \N; declaring it as an NA token
### lets readr parse it as missing instead of relying on as.numeric() to
### coerce it with "NAs introduced by coercion" warnings.
titles <- read_tsv(
  "title.basics.tsv.gz",
  col_select = c("tconst", "titleType", "primaryTitle", "runtimeMinutes"),
  na = c("", "NA", "\\N")
) |>
  filter(titleType == "movie") |>
  dplyr::select(-titleType) |>
  # Kept as a safeguard in case the column is still read as character.
  mutate(runtimeMinutes = as.numeric(runtimeMinutes))

### Inner join: credits for titles that are not movies are discarded here.
princ <- inner_join(princ, titles, by = join_by(tconst))
### Start averaging
#' Compute the h-index of a vector of scores.
#'
#' The h-index is the largest `h` such that at least `h` elements of `cites`
#' are greater than or equal to `h`.
#'
#' @param cites Numeric vector of scores (e.g. citation counts or floored
#'   ratings). Missing values are ignored.
#' @return A length-one integer. `0L` when `cites` is empty, contains only
#'   missing values, or has no element of at least 1.
h_index <- function(cites) {
  # sort() drops NA by default, so missing scores never reach the comparison.
  ranked <- sort(cites, decreasing = TRUE)
  # After a descending sort, `ranked[i] >= i` is TRUE for a leading run and
  # FALSE from then on, so counting the TRUEs gives the last qualifying rank.
  # Unlike tail(which(...), 1) this always yields a scalar, including 0 for
  # empty input or when nothing reaches 1.
  sum(ranked >= seq_along(ranked))
}
### Per-performer summary over titles that have a rating, a known runtime
### and at least 1000 votes, ordered from highest h-index down.
avg <- princ |>
  filter(
    !is.na(averageRating),
    !is.na(runtimeMinutes),
    numVotes >= 1000
  ) |>
  group_by(nconst) |>
  summarise(
    # Rating weighted by runtime, summed over the performer's titles.
    quality = sum(runtimeMinutes * averageRating),
    # Ratings are floored to whole numbers before the h-index is taken.
    h_index = h_index(floor(averageRating))
  ) |>
  arrange(desc(h_index))

### Link to names
names <- read_tsv("name.basics.tsv.gz")

### Performers whose h-index is exactly 8, best `quality` first.
res <- avg |>
  filter(h_index == 8) |>
  left_join(names, by = join_by(nconst)) |>
  arrange(desc(quality))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment