Text and string operations

`%notin%` <- function(x,y) !(x %in% y) # NOT IN FUNCTION!
ctrl_c <- function(x) {write.table(x, file="clipboard", row.names = F)} # COPY TO CLIPBOARD

Character correction

# REPLACE LETTERS WITH ACCENT TO PLAIN LETTER
accent_buster <- function(el){  
  output <- chartr("Á", "A", el)
  output <- chartr("É", "E", output)
  output <- chartr("Í", "I", output)
  output <- chartr("Ó", "O", output)
  output <- chartr("Ú", "U", output)
  output <- chartr("á", "a", output)
  output <- chartr("é", "e", output)
  output <- chartr("í", "i", output)
  output <- chartr("ó", "o", output)
  output <- chartr("ú", "u", output)
}
directors_ready <- accent_buster(directors_trim)

Fix Spanish accents

# Fixing accents from movie details data
fix_accents <- function(el){
  # check if input is a data frame
  if(is.data.frame(el) == FALSE){warning("input is not a data frame")}
    else{
      # coerce input as data frame class, removing potential data.table class
      el <- as.data.frame(el)
      # get columns and row numbers and header names from input 
      l <- ncol(el)
      r <- nrow(el)
      header_el <- colnames(el)
      # create output data frame
      result <- data.frame(matrix(NA, nrow = r, ncol = l))
      colnames(result) <- header_el
      for(i in 1:l){
        # if column is not character, copy to result data frame
       if(is.character(el[,i]) == FALSE){
         result[,i] <- el[,i]
       } else{
  # replace following characters
  output <- gsub("ñ", "ñ", el[,i], fixed = TRUE)
  output <- gsub("á", "a", output, fixed = TRUE)
  output <- gsub("i¡", "a", output, fixed = TRUE)
  output <- gsub("é", "e", output, fixed = TRUE)
  output <- gsub("i©", "e", output, fixed = TRUE)
  output <- gsub("Ã", "i", output, fixed = TRUE)
  output <- gsub("ó", "o", output, fixed = TRUE)
  output <- gsub("i³", "o", output, fixed = TRUE)
  output <- gsub("ú", "u", output, fixed = TRUE)
  output <- gsub("iº", "u", output, fixed = TRUE)
  output <- gsub("i¼", "u", output, fixed = TRUE)
  output <- gsub("¿", "¿", output, fixed = TRUE)
  output <- gsub("¡", "¡", output, fixed = TRUE)
  result[,i] <- output
       }
      }
    } 
  return(result)
}
movie_details_fixed <- fix_accents(movies_details)

Remove parenthesis from string elements

For instance: “Juan Batlle Planas (hijo)”, “Armando Bó (guionista)”, and so on.

Find elements

library(rebus)
library(stringr)
library(htmlwidgets)
str_view(directors_vfull, pattern = SPC %R% capture(OPEN_PAREN %R% one_or_more(WRD) %R% CLOSE_PAREN) %R% END, match = TRUE)

Remove and replace parenthesis elements

library(rebus)
library(stringr)
library(htmlwidgets)
directors_trim <- str_replace_all(directors_vfull, pattern = SPC %R% capture(OPEN_PAREN %R% one_or_more(WRD) %R% CLOSE_PAREN) %R% END, "")

Names: remove first name, leave only second name

# Short the name by trimming out first names
library(stringr)
for(i in 1:nrow(top_ten)){
  getLastName <- tail(strsplit(top_ten$director[i],split=" ")[[1]],1)
  top_ten$director[i] <- getLastName}
# Convert directors into factor variable to arrange the plot
fact_order <- unique(top_ten$director)
top_ten$director <- factor(top_ten$director, ordered = T, levels = fact_order)

Data management

Look up table (update variable values)

# Correcting city labels. Turning to character and removing first 0
eph2017$AGLOMERADO <- as.character(eph2017$AGLOMERADO)
# Look up table variable below
aglo_lut <- c("2" = "La Plata", "3" = "Bahía Blanca", "4" = "Rosario", "5" = "Santa Fé", "6" = "Paraná", "7" = "Posadas", "8" = "Resistencia", "9" = "Cdro Rivadav", "10" = "Mendoza", "12" = "Corrientes", "13" = "Córdoba", "14" = "Concordia", "15" = "Formosa", "17" = "Neuquén", "18" = "SdEstero", "19" = "Jujuy", "20" = "Río Gallegos", "22" = "Catamarca", "23" = "Salta", "25" = "La Rioja", "26" = "San Luis", "27" = "San Juan", "29" = "Tucumán", "30" = "Santa Rosa", "31" = "Ushuaia", "32" = "CABA", "33" = "Conurbano", "34" = "Mar del Plat", "36" = "Río Cuarto", "38" = "San Nicolás", "91" = "Rawson", "93" = "Viedma")
# Apply look up table
eph2017$AGLOMERADO <- aglo_lut[eph2017$AGLOMERADO]

Show full tibble (dplyr)

print(tbl_df(byCountry_tile), n=46)
# n is how many rows does the tibble contain

Reorder a categorical variable

Defining categorical by higher to lower frequency (before plotting)

byCountry_b <-travelPeopleRaw %>%
  group_by(Nationality) %>%
  tally() %>%
  arrange(desc(n))  # Order data by frequency
# Define levels by frequency
countryLevels <- unique(byCountry_b$Nationality)
# Define factor
byCountry_b$Nationality <- factor(byCountry_b$Nationality, ordered = T, levels = countryLevels)

ggplot(byCountry_b, aes(x = Nationality, y = n)) + geom_bar(stat="identity") +
  coord_flip()

Reorder variable IN a plot

ggplot(who_subset, aes(x = log10(cases_1992), y = reorder(country, cases_1992))) +
    geom_point()

Check missing (NA) values in a dataset

co2_NA <- co2_pc_raw %>%
  select(c(2:216)) %>%  # GET YEAR VARIABLES
  t() %>%               # TRANSPOSE YEARS AS ROWS
  as.data.frame %>%
  is.na                 # GET NAs

Remove missing values

### REMOVE COUNTRIES WITH TOO MANY NAs
co2_removeNA <- co2_1950_NA %>%  # DEFINE COUNTRIES TO BE REMOVED
  arrange(desc(n_na)) %>%
  filter(n_na > 7)
co2_pc_1950_filter <- co2_pc_1950 %>% # FILTER THEM OUT
  filter(!country %in% co2_removeNA[,1])

Spread uneven key-values into rows

Given a dataset with key-value pairs, where keys differ in the number of values they have, this function will gather keys as rows (one row per key) and sort values for every key.

spread_data <- function(data, key, value, numchar){
  # data is a dataset
  # key is the variable to fetch   ### MANAGE key AND value AS COLUMN INDEX?
  # value is the values to be gathered
  # numchar is the total length for every value
  #####
  library(tidyr)
  library(dplyr)
  names <- colnames(data)
  key_name <- names[key]
  value_name <- names[value]
  u <- length(unique(data[,key])) # how many uniques (index)
  w <- unique(data[,key]) # vector with uniques
  #creating output element
  output <- data.frame(key_name = character(), value_name = character())
  for(i in 1:u){
    # get how many elements are in the given key
    nk <- w[i]  # get an unique key
    n_index <- which(data[,key] == nk)  # index of rows with key == nk
    #n <- length(n_index)  # number of rows with nk
    #
    selection <- data[which(data[,key] == nk),]
    selection <- selection[,c(key, value)]
    s <- spread(selection, value_name, key_name) # spread data as column names
    vals <- colnames(s) # get values vector
    new_row <- cbind(nk, paste(vals, collapse = ", "))  # create new_row to append to output
    output <- rbind(output, new_row)
    rm(new_row)
  }
  colnames(output) <- c(key_name, value_name)
  #reconvert output element as a character only data frame
  output <- data.frame(lapply(output, as.character), stringsAsFactors = FALSE)
  # figure out number of columns to add in output data frame
  new_vals <- as.character(pull(output, 2))
  val_cols <- gsub(", ", "", new_vals) # remove separating commas
  num_new_cols <- max(nchar(val_cols))/numchar # count max number of values
  for(i in 1:num_new_cols){
    output[,(i+2)] <- NA # add as many new column as max number of values
  }
  # split new values into a list, one sublist for every new row
  l_nv <- strsplit(new_vals, ", ", fixed = TRUE)
  o_length <- nrow(output)
  for(i in 1:o_length){
    for(j in 1:num_new_cols){
      output[i, (j + 2)] <- l_nv[[i]][j]
    }
  }
  output[,2] <- NULL #dumping long string source
  # setting new column names for output element
  new_cols_name <- c(colnames(output)[1])
  for(i in 1:num_new_cols){
    new_cols_name <- c(new_cols_name, paste(value_name, "_", i, sep = ""))
  }
  colnames(output) <- new_cols_name
  return(output)
}

cast_fix <- spread_data(cast_filter, 1, 3, 9)

Bin continuous variable into ordinal variable (CO2 emission rankings)

### CLASSIFYING COUNTRIES
differenceTrend <- function(d){
  output <- as.character()
  for(i in 1:nrow(d)){
  if(is.na(d[i, difference])){output <- c(output, "N/A")}
    else if(d[i, difference] < 0.51){output <- c(output, "Below Var")}
    else if(d[i, difference] < 3){output <- c(output, "Normal Var")}
    else(output <- c(output, "Above Var"))
  }
  return(output)
}
varProfile <- differenceTrend(co2_y2y_var)
# Factorize values
varProfile <- factor(varProfile, c("Below Var", "Normal Var", "Above Var", "N/A"))

Bin data points by decade (Argentine movie)

decade_sampling <- function(data, key, yVar, yStart = min(data[,yVar]), yEnd = max(data[,yVar]), sample_prop = 0.33){
  # data is a dataset
  # key is the data$key element to gather - a 
  # yVar is yearIndex
  # yStart AND yEnd are years
  # sample_prop is the proportion of rows to take from each decade "0.10"
  ## Check if year variables is numeric or integer
  if((is.numeric(data[,yVar]) | is.integer(data[,yVar])) == FALSE){
    print("yVar element is not numeric or integer. Fix it.")
  }
  ## Define the decade span
  hm_decades <- (((yEnd -yStart) - ((yEnd -yStart) %% 10)) / 10) + 1
  decades_floor <- yStart - (yStart %% 10)
  decades_ceiling <- yEnd + (10 - yEnd %% 10)
  ## Group the years in decades
  decades_vector <- character()
  for(i in 1:hm_decades){
    j <- i -1
    d_floor <- decades_floor + (j * 10)
    d_ceiling <- decades_floor + (i * 10)
    decades_vector <- c(decades_vector, paste(d_floor, "-", d_ceiling))
  }
  ## Create the output element
  output <- as.data.frame(data[1,])
  #output <- data.frame(key_name = character(), decade = factor(decades_vector, levels = length(decades_vector)), year = integer())
  ## Subset dataset by decade
  for(i in 1:hm_decades){
    j <- i -1
    d_floor <- decades_floor + (j * 10)
    d_ceiling <- decades_floor + (i * 10)
    current_decade <- d_floor:d_ceiling
    decade_subset <- data[data[,yVar] %in% current_decade,]
    ## Get the sample
    decade_sample <- sample(decade_subset[,key], round(nrow(decade_subset)*sample_prop))
    sample_data <- data[data[,key] %in% decade_sample,]
    rm(decade_sample)
    ## Put all decades together
    output <- rbind(output, sample_data)
    rm(sample_data)
  }
  return(output)
}

test_set <- arg_movies[, c(1, 2, 3, 8)]
no_year <- which(test_set$releaseYear == "\\N")
test_set <- test_set[-no_year,]
test_set$releaseYear <- as.integer(test_set$releaseYear)
test_output <- decade_sampling(test_set, 1, 4, sample_prop = 0.05)
test_me <- test_output[test_output[,4] %in% c(1950:1960),]

Bin elements together (Argentinean movies in 5 year period)

# group Argentine movies in 5 year period
yFloor <- 1899
yCeiling <- 2019
ySpan <- (yCeiling - yFloor)/5
arg_movies_5year <- data.frame(year5 = as.integer(), total_awards = as.integer())
for(i in 1:ySpan){
  j <- i - 1
  current5y <- yFloor + (j * 5)
  y5 <- arg_movies_awards_year[(arg_movies_awards_year[,3] %in%  current5y:(current5y + 5)),]
  add_me <- c(current5y, sum(y5$total_awards))
  new_row <- nrow(arg_movies_5year) + 1
  arg_movies_5year[new_row, 1] <- current5y
  arg_movies_5year[new_row, 2] <- sum(y5$awards)
}

Categorize elements by percentile position

### DEFINE PROFILES
CO2_profiles <- co2_pc_1950_filter %>%
  mutate(earlyAvg = rowMeans(select(., 2:16), na.rm = TRUE), #1950-65
         earlyQ = ntile(earlyAvg, 4),                      # Quantile in distribution
         midAvg = rowMeans(select(., 17:41), na.rm = TRUE), #1966-90
         midQ = ntile(midAvg, 4),
         lateAvg = rowMeans(select(., 42:67), na.rm = TRUE), # 1991-2014
         lateQ = ntile(lateAvg, 4),
         changeMid = !(midQ == earlyQ),   # IS THE RANK THE SAME AS THE LAST PERIOD?
         changeLate = !(midQ == lateQ),
         changeE_L = !(lateQ == earlyQ)) %>%
  select(c(1, 68:ncol(.)))

Extract and create group of elements by quantile or percentile

### FIND COUNTRIES WITH HIGHEST HDI DURING 90
nineties <- colnames(HDI_filter[2:12])
countries90sTopHDI <-as.character()
  
HDI_filter <- as.data.frame(HDI_filter)
for(i in 1:length(nineties)){
  column <- i + 1
  p20HDI <- quantile(HDI_filter[,column], 0.8, na.rm = T) # extract top 20%
  Cp20HDI <- HDI_filter %>%
    select(country, column)
  Cp20HDI <- Cp20HDI[which(Cp20HDI[,2] > p20HDI),1]
  countries90sTopHDI <- c(countries90sTopHDI, Cp20HDI)
}
# CREATE GROUP
countries90sTopHDI <- as.data.frame(table(countries90sTopHDI))
# get countries with more than 8 presence in the top 20%
countries90sTopHDI <- countries90sTopHDI[which(countries90sTopHDI[2] > 8),1]
countries90sTopHDI <- as.character(countries90sTopHDI)

Decile analysis (CO2 changes over decile ranking)

CO2_profiles_decil <- co2_pc_1950_filter %>%
  mutate(d_1950 = ntile(`1950`, 10),
         d_1960 = ntile(`1960`, 10),
         d_1970 = ntile(`1970`, 10),
         d_1980 = ntile(`1980`, 10),
         d_1990 = ntile(`1990`, 10),
         d_2000 = ntile(`2000`, 10),
         d_2010 = ntile(`2010`, 10)) %>%
  select(country, d_1950, d_1960, d_1970, d_1980, d_1990, d_2000, d_2010)

CO2_profiles_decil$netSum <- computeRange(CO2_profiles_decil, c(2:(ncol(CO2_profiles_decil))), function(x) sum(diff(x)))

Categorize a variable based on another variable changes

Defining profiles by variation of CO2 emissions quartile variation

behavior_p <- function(d){
  if(
    d[3] == 1 & d[5] == 1 & d[7] == 1){ # LOW EMISSIONS NO CHANGE
    return(1)} else if(
      d[3] == 2 & d[5] == 2 & d[7] == 2 # MID-LOW NO CHANGE
    ){return(2)} else if(
      d[3] == 3 & d[5] == 3 & d[7] == 3
    ){return(3)} else if(
      d[3] == 4 & d[5] == 4 & d[7] == 4 # HIGH EMISSIONS NO CHANGE
    ){return(4)} else if(
      d[3] < d[7]        # EMISSIONS INCREASE = [[5]]
    ){return(5)} else if(
      d[3] > d[7]        # EMISSIONS DECREASE = [[6]]
    ){return(6)} else if(
      d[5] > d[7]       # VARIATION MID HIGH TO LOW
    ){return(7)} else if(
      d[5] < d[7]       # VARIATION MID LOW TO HIGH
    ){return(8)}else{return(9)}       # OTHER CASES
}

Adding the variable

# ADD BEHAVIOR
behavior <- apply(CO2_profiles, 1, behavior_p) # COMPUTE CATEGORIES
CO2_p_behavior <- CO2_profiles
CO2_p_behavior[, (ncol(CO2_p_behavior) + 1)] <- behavior # ADD TO NEW DF
colnames(CO2_p_behavior)[ncol(CO2_p_behavior)] <- "behavior"

Sample evenly a group

subset_sample <- function(d, cats){
  output <- data.frame()
  c <- length(unique(d[,cats]))
  for(i in 1:c){
    subgroup <- d[which(d[,cats] == i),]
    sample_i <- sample(subgroup[,1], round(nrow(subgroup)* 0.50))
    sub_sample <- d[which(d[,1] %in% sample_i), ]
    output <- rbind(output, sub_sample)
  }
  return(output)
}

Row-wise calculations over a defined range of columns

computeRange <- function(d, range, compute){
  # d is a data source *DATA FRAME*
  # range is a *VECTOR* column range / ex: c(1:5) OR c(2:(ncol(MY_DF) - 1))
  # compute is the computing *EXPRESSION* / ex: function(x) sum(diff(x))
  d <- as.data.frame(d)
  output <- numeric()
  for(r in 1:nrow(d)){
    getRow <- d[r,range]
    rowNums <- as.numeric(getRow)
    rowResult <- compute(rowNums)
    output <- c(output, rowResult)
  }
  return(output)
}
# Make a copy of the original DF
CO2_profiles_varStats <- CO2_p_behavior
# Create NEW COLUMNS, using the function
CO2_profiles_varStats$netSum <- computeRange(co2_pc_1950_filter, c(2:(ncol(co2_pc_1950_filter) - 1)), function(x) sum(diff(x)))
CO2_profiles_varStats$absSum <- computeRange(co2_pc_1950_filter, c(2:(ncol(co2_pc_1950_filter) - 2)), function(x) sum(Mod(diff(x))))
CO2_profiles_varStats$stDev <- computeRange(co2_pc_1950_filter, c(2:(ncol(co2_pc_1950_filter) - 3)), function(x) sd(x, na.rm = TRUE))

Applies

Update values

new_income <- numeric()
### Define function
correct_income <- function(old_var, new_var){
  if(old_var >= 99999){new_var <- 100000}
  else {new_var <- old_var}
}
### Result variable is the result of applying function above over source
new_income <- sapply(income, correct_income, new_var = new_income)

Apply means and bin elements together

#  COMPARING MEANS
mean_earlyXX <- lapply(earlyXX_awards, mean)
df_earlyXX <- as.data.frame(unlist(mean_earlyXX))
mean_midXX <- lapply(midXX_awards, mean)
df_midXX <- as.data.frame(unlist(mean_midXX))
mean_lateXX <- lapply(lateXX_awards, mean)
df_lateXX <- as.data.frame(unlist(mean_lateXX))
# Join them together
names(df_earlyXX)[1] <- "average_awards"
names(df_midXX)[1] <- "average_awards"
names(df_lateXX)[1] <- "average_awards"
sampleMeans_df <- rbind(df_earlyXX, df_midXX, df_lateXX)
# Add row names as year value
sampleMeans_df <- cbind(year5 = rownames(sampleMeans_df), sampleMeans_df)
sampleMeans_df$year5 <- as.character(sampleMeans_df$year5)
# Parse column as year number
library(stringr)
for(i in 1:nrow(sampleMeans_df)){
  getYear <- tail(strsplit(sampleMeans_df$year5[i],split="-")[[1]],1)
  sampleMeans_df$year5[i] <- getYear
sampleMeans_df$year5 <- as.integer(sampleMeans_df$year5)

Large datasets sourcing

Raw code

output <- fread("imdb/names.tsv", header = TRUE, nrows = 0)
h_names <- colnames(output)
filter_names <- function(){
  output <- fread("imdb/names.tsv", header = TRUE, nrows = 0)
  for(i in 1:9){
    j <- i - 1
    names_raw <- fread("imdb/names.tsv", nrows = 1000000, skip = (j * 1000000))
    colnames(names_raw) <- h_names
    # ADD COLUMN NAMES TO names_raw
    filter <- names_raw[which(names_raw$primaryName %in% directors_ready)]
    rm(names_raw)
    output <- rbind(output, filter, fill = TRUE)
    rm(filter)}
}

Function version

library(data.table)
filter_file <- function(source, iteration, reference, join_key){
  # source # file location "imdb/titles_info.tsv"
  # iteration # how many millions of rows are in the source
  # reference # a vector containing matches values to filter with
  # join_key # a string referring to the key variable between source and reference
  output <- fread(source, header = TRUE, nrows = 0)
  header_data <- colnames(output)
  for(i in 1:iteration){
    j <- i - 1
    data_raw <- fread(source, nrows = 1000000, skip = (j * 1000000))
    colnames(data_raw) <- header_data
    filter <- data_raw[which(data_raw[[join_key]] %in% reference)]
    rm(data_raw)
    output <- rbind(output, filter, fill = TRUE)
    rm(filter)}
  return(output)
}
arg_titles <- filter_file("imdb/crew.tsv", 6, argdirectors_imdb$nconst, "directors")

Web scraping

Obtain data from a site (Wikipedia category case)

library(rvest)
directors_output1 <- character() # Define vector outside of the loop
for(i in 1:13){  # Length is number of letters in website (see url variable)
  url <- "https://es.wikipedia.org/wiki/Categor%C3%ADa:Directores_de_cine_de_Argentina"
  directors_input <- url %>%
  read_html() %>%
  html_nodes(xpath = paste('/html/body/div[3]/div[3]/div[4]/div[2]/div[2]/div/div/div[', i, ']/ul')) %>% # xpath is the HTML location.
  html_text() # Obtain HTML text from the node
  # The result is a single string with a "\n" n between every name
  directors_split <- strsplit(directors_input, "\n")
  directors_output1 <- c(directors_output1, directors_split)
}

Loop over many sub-pages for data (IMDB awards case)

Ran on VM

# AWARD DATA SCRAPPER
imdb_scrapper <- function(data, key, new_var_name){
  # data = ELEMENT dataset name
  # key = NUMBER identifier column in data
  # new_var_name = STRING new name to add scrapped data
  library(rebus)
  library(stringr)
  library(htmlwidgets)
  library(data.table)
  library(httr)
  library(rvest)
  data <- as.data.frame(data)
  key_length <- length(unique(data[,key]))
  hector <- numeric()
  for(i in 1:key_length){  # Length is number of letters in website (see url variable)
    url <- paste0("https://www.imdb.com/title/", data[i, key], "/awards?ref_=tt_awd")
     input <- url %>%
      read_html() %>%
      html_nodes(xpath = '/html/body/div[2]/div/div[2]/div[3]/div[1]/div[1]/div/div[2]/div/div') %>%
      html_text() # Obtain HTML text from the node
    # Read input string and detect how many wins OR nominations
    if(length(input) == 0){
      hector <- c(hector, 0)
    } else {
     pat <- "(?<=all\\s)\\d+" # Detect digits after "all "
     # awards = (?<=all\\s)\\d+ // nominations "(?<=and\\s)\\d+"
     get_data <- as.numeric(str_extract(input, pat))
     hector <- c(hector, get_data)}
    # Add data to vector and loop over
  }
  # Add new column to data with name new_var_name
  nc <- ncol(data)
  data[,(nc + 1)] <- hector
  colnames(data)[nc + 1] <- new_var_name
  return(data)
}

Scrap the web for information from a sample

Awards and nominations wordwide sample case

imdb_data_assembler <- function(sample, key, yKey, pat){
  library(dplyr)
  library(rebus)
  library(stringr)
  library(htmlwidgets)
  library(data.table)
  library(httr)
  library(rvest)
  # Define the time span
  yFloor <- as.numeric(min(sample[,yKey]))
  yCeiling <- as.numeric(max(sample[,yKey]))
  ySpan <- (yCeiling - yFloor)/5
  # Create output element
  output <- list()
  # Filter movies within 5 year span
  for(i in 1:ySpan){
    j <- i - 1
    current5y <- yFloor + (j * 5)
    y5 <- sample[(sample[,4] %in%  current5y:(current5y + 5)),]
    # Scrap the data
    #get_data <- imdb_sample_scrapper(y5, key, pat)
    ####
    key_length <- length(unique(y5[,key]))
    hector <- numeric()
    for(a in 1:key_length){  # Length is number of letters in website (see url variable)
      url <- paste0("https://www.imdb.com/title/", y5[a, key], "/awards?ref_=tt_awd")
      try(input <- url %>%
            read_html() %>%
            html_nodes(xpath = '/html/body/div[2]/div/div[2]/div[3]/div[1]/div[1]/div/div[2]/div/div') %>%
            html_text())
      # Read input string and detect how many wins OR nominations
      try(if(length(input) == 0){
        hector <- c(hector, 0)
      } else {
        # awards = (?<=all\\s)\\d+ // nominations "(?<=and\\s)\\d+"
        get_data <- as.numeric(str_extract(input, pat))
        hector <- c(hector, get_data)})
      # Add data to vector and loop over
    }
    # Store the data
    ll <- length(output)
    output[[ll+1]] <- hector
    names(output)[ll+1] <- paste0(current5y,"-",(current5y + 5))
    }
  #######
    return(output)
  }
  
my_output <- imdb_data_assembler(sample = test_me, key = 1, yKey = 4, pat = "(?<=all\\s)\\d+")

Store results as lists; if data sets are too large, divide them in time periods and assemble together.

earlyXX <- decade_sampling (movies_worldwide, 1, 3, 1900, 1940, sample_prop = 0.25)
earlyXX_awards <- imdb_data_assembler(sample = earlyXX, key = 1, yKey = 3, pat =  "(?<=all\\s)\\d+")
#
midXX <- decade_sampling (movies_worldwide, 1, 3, 1940, 1980, sample_prop = 0.20)
midXX_awards <- imdb_data_assembler(sample = midXX, key = 1, yKey = 3, pat =  "(?<=all\\s)\\d+")
#
lateXX <- decade_sampling (movies_worldwide, 1, 3, 1980, 2020, sample_prop = 0.20)
lateXX_awards <- imdb_data_assembler(sample = lateXX, key = 1, yKey = 3, pat =  "(?<=all\\s)\\d+")

Statistic analysis

Chi Square function (morph data into x2 table to test)

HDI_filter <- as.data.frame(HDI_filter)
CO2redTest <- function(q, f = 15, baseY){
  # q is quantile # 0.75
  # f is frequency of a conuntry enough to admit it among high HDI country
  # baseY is the base year, which will take it as earliest and the following 4
  hHDIcountries <- as.character()
  for(i in 1:26){
    column <- i + 1
    pHDI <- quantile(HDI_filter[,column], q, na.rm = T) # extract by q%
    CpHDI <- HDI_filter %>%
      select(country, column)
    CpHDI <- CpHDI[which(CpHDI[,2] > pHDI),1]
    hHDIcountries <- c(hHDIcountries, CpHDI) # get a vector with countries, repeteaded for every year in the top q%
  }
  hHDIcountries <- as.data.frame(table(hHDIcountries))
  # get countries with as many 'f' presence in the top q%
  hHDIcountries <- hHDIcountries[which(hHDIcountries[2] > f),1]
  hHDIcountries <- as.character(hHDIcountries)
  y <- baseY - 1948 # Retransform base year
  CO2_reduct <- as.data.frame(co2_pc_1950_filter)
  # Get first average: base year + next 4.
  CO2_reduct$avg1 <- apply(CO2_reduct[,y:(y + 4)], 1, function(x) sum(x, na.rm = T)/5)
    CO2_reduct <- CO2_reduct %>%
    rowwise() %>% # APPLY CALCULATIONS BY ROWS
    mutate(avg2 = (sum(`2010`, `2011`, `2012`, `2013`, `2014`, na.rm = T))/5,
           CO2reduction = ifelse(avg2 < avg1, "Reduced CO2", "No reduction"), #CHECK IF REDUCED EMISSIONS
           HDI_level = ifelse(country %in% hHDIcountries, "High HDI", "Low HDI")) %>%
    select(country, avg1, avg2, CO2reduction, HDI_level)
    
    contT <- table(CO2_reduct$CO2reduction, CO2_reduct$HDI_level)
    #chiTest <- chisq.test(CO2_reduct$CO2reduction, CO2_reduct$HDI_level, correct = FALSE)
    #output <- list(contT, chiTest)
    return(contT)
}
c1990_80 <- CO2redTest(0.8, 15, 1990)
chisq.test(c1990_80, correct = FALSE)

Map plots

Basic plot

library(maps)
WorldData <- map_data('world')
WorldData %>% filter(region != "Antarctica") -> WorldData
WorldData <- fortify(WorldData)
map <- ggplot() + geom_map(data=WorldData, map=WorldData,
                           aes(x=long, y=lat, group=group, map_id=region),
                           fill="white", colour="#7f7f7f", size=0.5)

Colour a map as per categories (CO2 percentiles)

profileMap <-   map + scale_fill_continuous(low="green", high="darkred", guide="colorbar", labels = c("Low", "Mid-low", "Mid-High", "High")) + 
  guides(fill = guide_colourbar(barheight = 8))

profileMap + geom_map(data=CO2_profiles, map=WorldData,
                      aes(fill= earlyQ, map_id=country),
                      colour="#7f7f7f", size=0.5) +
  labs(title = "      CO2 Emission profiles ranking", subtitle ="      1950-1965", fill = "Profile group") +
  theme_void()   # Remove styles
ggsave("emissionProf_50_65.png")

Plot animation

library(animation)
Sys.setenv(PATH = paste("C:/PROGRA~1/ImageMagick-7.0.7-Q8",
                        Sys.getenv("PATH"), sep = ";"))
ani.options(convert = 'C:/PROGRA~1/ImageMagick-7.0.7-Q8/convert.exe')

### ORIGINAL TEST - IT WORKED!
boxplot_animate <- CO2_year_country %>%
  group_by(b_color)

boxplot_animate$year <- as.integer(boxplot_animate$year)

a_test <- ggplot(boxplot_animate, aes(x = b_color, y = CO2epc, frame = year)) +
  geom_boxplot() + transition_time(year)
animate(a_test)

Improved version

### IMPROVING THE TEST
animated_bp_profile <- ggplot(boxplot_animate, aes(x = b_color, y = CO2epc, frame = year, fill = b_color)) +
  geom_boxplot() + transition_time(year) +
  scale_y_continuous(limits = c(0,20)) + 
  scale_fill_manual(values = sort(unique(CO2_year_country$b_color)), breaks = CO2_year_country$behavior) +
  labs(title = "CO2 Emissions per capita by profiles",subtitle = 'Year: {frame_time}', x = 'Profile', y = 'CO2 Emission per capita')
animate(animated_bp_profile)

FINISH ON co2_dev_variation.R; going top-down in folder