`%notin%` <- function(x,y) !(x %in% y) # NOT IN FUNCTION!
ctrl_c <- function(x) {write.table(x, file="clipboard", row.names = F)} # COPY TO CLIPBOARD
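A quick sanity check of the helper (illustrative only; ctrl_c() targets the Windows clipboard):
5 %notin% c(1, 2, 3) # TRUE
"b" %notin% c("a", "b") # FALSE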
# REPLACE LETTERS WITH ACCENT TO PLAIN LETTER
accent_buster <- function(el){
  # chartr() translates character for character, so every accented vowel
  # can be mapped to its plain equivalent in a single pass
  chartr("ÁÉÍÓÚáéíóú", "AEIOUaeiou", el)
}
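For instance (illustrative):
accent_buster("Armando Bó") # "Armando Bo"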
directors_ready <- accent_buster(directors_trim)
# Fixing accents from movie details data
fix_accents <- function(el){
  # stop early if the input is not a data frame
  if(!is.data.frame(el)){
    stop("input is not a data frame")
  }
  # coerce the input to a plain data frame, removing a potential data.table class
  el <- as.data.frame(el)
  # get column and row counts and header names from the input
  l <- ncol(el)
  r <- nrow(el)
  header_el <- colnames(el)
  # create the output data frame
  result <- data.frame(matrix(NA, nrow = r, ncol = l))
  colnames(result) <- header_el
  for(i in 1:l){
    # non-character columns are copied to the result as-is
    if(!is.character(el[,i])){
      result[,i] <- el[,i]
    } else{
      # replace both the plain accented characters and their mojibake forms
      # (e.g. "Ã¡" is "á" from a UTF-8 file read with the wrong encoding)
      output <- gsub("Ã±", "ñ", el[,i], fixed = TRUE)
      output <- gsub("á", "a", output, fixed = TRUE)
      output <- gsub("Ã¡", "a", output, fixed = TRUE)
      output <- gsub("é", "e", output, fixed = TRUE)
      output <- gsub("Ã©", "e", output, fixed = TRUE)
      output <- gsub("Ã­", "i", output, fixed = TRUE)
      output <- gsub("ó", "o", output, fixed = TRUE)
      output <- gsub("Ã³", "o", output, fixed = TRUE)
      output <- gsub("ú", "u", output, fixed = TRUE)
      output <- gsub("Ãº", "u", output, fixed = TRUE)
      output <- gsub("Ã¼", "u", output, fixed = TRUE)
      output <- gsub("Â¿", "¿", output, fixed = TRUE)
      output <- gsub("Â¡", "¡", output, fixed = TRUE)
      result[,i] <- output
    }
  }
  return(result)
}
movie_details_fixed <- fix_accents(movies_details)
Some director names carry a parenthetical qualifier that has to be stripped, for instance “Juan Batlle Planas (hijo)”, “Armando Bó (guionista)”, and so on.
library(rebus)
library(stringr)
library(htmlwidgets)
str_view(directors_vfull, pattern = SPC %R% capture(OPEN_PAREN %R% one_or_more(WRD) %R% CLOSE_PAREN) %R% END, match = TRUE)
directors_trim <- str_replace_all(directors_vfull, pattern = SPC %R% capture(OPEN_PAREN %R% one_or_more(WRD) %R% CLOSE_PAREN) %R% END, "")
# Shorten each name by trimming out first names, keeping only the last word
library(stringr)
for(i in 1:nrow(top_ten)){
  getLastName <- tail(strsplit(top_ten$director[i], split = " ")[[1]], 1)
  top_ten$director[i] <- getLastName
}
# Convert directors into factor variable to arrange the plot
fact_order <- unique(top_ten$director)
top_ten$director <- factor(top_ten$director, ordered = T, levels = fact_order)
# Correcting city labels: convert the codes to character (dropping the leading 0) so they match the lookup table names
eph2017$AGLOMERADO <- as.character(eph2017$AGLOMERADO)
# Look up table variable below
aglo_lut <- c("2" = "La Plata", "3" = "Bahía Blanca", "4" = "Rosario", "5" = "Santa Fé", "6" = "Paraná", "7" = "Posadas", "8" = "Resistencia", "9" = "Cdro Rivadav", "10" = "Mendoza", "12" = "Corrientes", "13" = "Córdoba", "14" = "Concordia", "15" = "Formosa", "17" = "Neuquén", "18" = "SdEstero", "19" = "Jujuy", "20" = "Río Gallegos", "22" = "Catamarca", "23" = "Salta", "25" = "La Rioja", "26" = "San Luis", "27" = "San Juan", "29" = "Tucumán", "30" = "Santa Rosa", "31" = "Ushuaia", "32" = "CABA", "33" = "Conurbano", "34" = "Mar del Plat", "36" = "Río Cuarto", "38" = "San Nicolás", "91" = "Rawson", "93" = "Viedma")
# Apply look up table
eph2017$AGLOMERADO <- aglo_lut[eph2017$AGLOMERADO]
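Name-indexing is what makes the lookup table work; a toy illustration:
lut <- c("2" = "La Plata", "32" = "CABA")
lut[c("32", "2", "32")] # elements are fetched by name: "CABA" "La Plata" "CABA"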
print(tbl_df(byCountry_tile), n=46)
# n sets how many rows of the tibble to print
byCountry_b <- travelPeopleRaw %>%
  group_by(Nationality) %>%
  tally() %>%
  arrange(desc(n)) # Order the data by frequency
# Define levels by frequency
countryLevels <- unique(byCountry_b$Nationality)
# Define factor
byCountry_b$Nationality <- factor(byCountry_b$Nationality, ordered = T, levels = countryLevels)
ggplot(byCountry_b, aes(x = Nationality, y = n)) + geom_bar(stat="identity") +
coord_flip()
ggplot(who_subset, aes(x = log10(cases_1992), y = reorder(country, cases_1992))) +
geom_point()
co2_NA <- co2_pc_raw %>%
  select(c(2:216)) %>% # GET YEAR VARIABLES
  t() %>% # TRANSPOSE YEARS AS ROWS
  as.data.frame() %>%
  is.na() # GET NAs
### REMOVE COUNTRIES WITH TOO MANY NAs
co2_removeNA <- co2_1950_NA %>% # DEFINE COUNTRIES TO BE REMOVED
  arrange(desc(n_na)) %>%
  filter(n_na > 7)
co2_pc_1950_filter <- co2_pc_1950 %>% # FILTER THEM OUT
  filter(!country %in% co2_removeNA[,1])
Given a dataset of key-value pairs where keys differ in the number of values they have, this function gathers each key into a single row and spreads that key's values across numbered columns.
spread_data <- function(data, key, value, numchar){
  # data is a dataset
  # key is the column index of the variable to fetch ### MANAGE key AND value AS COLUMN INDEX?
  # value is the column index of the values to be gathered
  # numchar is the total character length of every value
  library(tidyr)
  library(dplyr)
  names <- colnames(data)
  key_name <- names[key]
  value_name <- names[value]
  u <- length(unique(data[,key])) # how many uniques (index)
  w <- unique(data[,key]) # vector with uniques
  # create the output element
  output <- data.frame(key_name = character(), value_name = character())
  for(i in 1:u){
    nk <- w[i] # get a unique key
    # subset the rows belonging to this key, keeping only the key and value columns
    selection <- data[which(data[,key] == nk), c(key, value)]
    s <- spread(selection, value_name, key_name) # spread the values as column names
    vals <- colnames(s) # get the values vector
    new_row <- cbind(nk, paste(vals, collapse = ", ")) # new_row: key plus its collapsed values
    output <- rbind(output, new_row)
    rm(new_row)
  }
  colnames(output) <- c(key_name, value_name)
  # reconvert the output element into a character-only data frame
  output <- data.frame(lapply(output, as.character), stringsAsFactors = FALSE)
  # figure out the number of columns to add to the output data frame
  new_vals <- as.character(pull(output, 2))
  val_cols <- gsub(", ", "", new_vals) # remove the separating commas
  num_new_cols <- max(nchar(val_cols)) / numchar # max number of values per key
  for(i in 1:num_new_cols){
    output[,(i + 2)] <- NA # add as many new columns as the max number of values
  }
  # split the collapsed values into a list, one element per row
  l_nv <- strsplit(new_vals, ", ", fixed = TRUE)
  o_length <- nrow(output)
  for(i in 1:o_length){
    for(j in 1:num_new_cols){
      output[i, (j + 2)] <- l_nv[[i]][j]
    }
  }
  output[,2] <- NULL # dump the long string source
  # set new column names for the output element
  new_cols_name <- c(colnames(output)[1])
  for(i in 1:num_new_cols){
    new_cols_name <- c(new_cols_name, paste(value_name, "_", i, sep = ""))
  }
  colnames(output) <- new_cols_name
  return(output)
}
cast_fix <- spread_data(cast_filter, 1, 3, 9)
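A toy illustration of the reshaping (hypothetical data; IMDb-style IDs are 9 characters long, hence numchar = 9):
toy_cast <- data.frame(tconst = c("tt0000001", "tt0000001", "tt0000002"),
                       ordering = 1:3,
                       nconst = c("nm0000001", "nm0000002", "nm0000003"),
                       stringsAsFactors = FALSE)
spread_data(toy_cast, 1, 3, 9)
# expected shape: one row per tconst, with columns nconst_1, nconst_2, ...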
### CLASSIFYING COUNTRIES
differenceTrend <- function(d){
  output <- as.character()
  for(i in 1:nrow(d)){
    # the year-to-year variation is assumed to live in a column named "difference"
    if(is.na(d[i, "difference"])){output <- c(output, "N/A")}
    else if(d[i, "difference"] < 0.51){output <- c(output, "Below Var")}
    else if(d[i, "difference"] < 3){output <- c(output, "Normal Var")}
    else{output <- c(output, "Above Var")}
  }
  return(output)
}
varProfile <- differenceTrend(co2_y2y_var)
# Factorize values
varProfile <- factor(varProfile, c("Below Var", "Normal Var", "Above Var", "N/A"))
decade_sampling <- function(data, key, yVar, yStart = min(data[,yVar]), yEnd = max(data[,yVar]), sample_prop = 0.33){
  # data is a dataset
  # key is the column index of the data$key element to gather
  # yVar is the column index of the year variable
  # yStart AND yEnd are years
  # sample_prop is the proportion of rows to take from each decade, e.g. 0.10
  ## Check that the year variable is numeric or integer
  if(!(is.numeric(data[,yVar]) | is.integer(data[,yVar]))){
    stop("yVar element is not numeric or integer. Fix it.")
  }
  ## Define the decade span
  hm_decades <- (((yEnd - yStart) - ((yEnd - yStart) %% 10)) / 10) + 1
  decades_floor <- yStart - (yStart %% 10)
  decades_ceiling <- yEnd + (10 - yEnd %% 10)
  ## Group the years in decades
  decades_vector <- character()
  for(i in 1:hm_decades){
    j <- i - 1
    d_floor <- decades_floor + (j * 10)
    d_ceiling <- decades_floor + (i * 10)
    decades_vector <- c(decades_vector, paste(d_floor, "-", d_ceiling))
  }
  ## Create an empty output element with the same columns as the input
  output <- data[0,]
  ## Subset the dataset by decade
  for(i in 1:hm_decades){
    j <- i - 1
    d_floor <- decades_floor + (j * 10)
    d_ceiling <- decades_floor + (i * 10)
    current_decade <- d_floor:(d_ceiling - 1) # exclude the ceiling year so boundary years are not sampled twice
    decade_subset <- data[data[,yVar] %in% current_decade,]
    ## Get the sample
    decade_sample <- sample(decade_subset[,key], round(nrow(decade_subset) * sample_prop))
    sample_data <- data[data[,key] %in% decade_sample,]
    rm(decade_sample)
    ## Put all decades together
    output <- rbind(output, sample_data)
    rm(sample_data)
  }
  return(output)
}
test_set <- arg_movies[, c(1, 2, 3, 8)]
no_year <- which(test_set$releaseYear == "\\N")
test_set <- test_set[-no_year,]
test_set$releaseYear <- as.integer(test_set$releaseYear)
test_output <- decade_sampling(test_set, 1, 4, sample_prop = 0.05)
test_me <- test_output[test_output[,4] %in% c(1950:1960),]
# group Argentine movies into 5-year periods
yFloor <- 1899
yCeiling <- 2019
ySpan <- ((yCeiling - yFloor) %/% 5) + 1 # enough 5-year buckets to cover the ceiling year
arg_movies_5year <- data.frame(year5 = as.integer(), total_awards = as.integer())
for(i in 1:ySpan){
  j <- i - 1
  current5y <- yFloor + (j * 5)
  # a half-open 5-year window, so boundary years are not counted twice
  y5 <- arg_movies_awards_year[(arg_movies_awards_year[,3] %in% current5y:(current5y + 4)),]
  new_row <- nrow(arg_movies_5year) + 1
  arg_movies_5year[new_row, 1] <- current5y
  arg_movies_5year[new_row, 2] <- sum(y5$total_awards) # matches the column defined above
}
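The same bucketing can be sketched more compactly with dplyr, assuming (as the loop above does) that the third column of arg_movies_awards_year holds the year and the awards column is total_awards:
library(dplyr)
year_vec <- arg_movies_awards_year[[3]]
arg_movies_5year_alt <- arg_movies_awards_year %>%
  mutate(year5 = 1899 + ((year_vec - 1899) %/% 5) * 5) %>% # floor each year to its 5-year bucket
  group_by(year5) %>%
  summarise(total_awards = sum(total_awards, na.rm = TRUE))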
### DEFINE PROFILES
CO2_profiles <- co2_pc_1950_filter %>%
  mutate(earlyAvg = rowMeans(select(., 2:16), na.rm = TRUE), # 1950-65
         earlyQ = ntile(earlyAvg, 4), # quartile in the distribution
         midAvg = rowMeans(select(., 17:41), na.rm = TRUE), # 1966-90
         midQ = ntile(midAvg, 4),
         lateAvg = rowMeans(select(., 42:67), na.rm = TRUE), # 1991-2014
         lateQ = ntile(lateAvg, 4),
         changeMid = !(midQ == earlyQ), # IS THE RANK THE SAME AS THE LAST PERIOD?
         changeLate = !(midQ == lateQ),
         changeE_L = !(lateQ == earlyQ)) %>%
  select(c(1, 68:ncol(.)))
### FIND COUNTRIES WITH THE HIGHEST HDI DURING THE 90s
nineties <- colnames(HDI_filter[2:12])
countries90sTopHDI <- as.character()
HDI_filter <- as.data.frame(HDI_filter)
for(i in 1:length(nineties)){
  column <- i + 1
  p20HDI <- quantile(HDI_filter[,column], 0.8, na.rm = T) # threshold for the top 20%
  Cp20HDI <- HDI_filter %>%
    select(country, column)
  Cp20HDI <- Cp20HDI[which(Cp20HDI[,2] > p20HDI),1]
  countries90sTopHDI <- c(countries90sTopHDI, Cp20HDI)
}
# CREATE GROUP
countries90sTopHDI <- as.data.frame(table(countries90sTopHDI))
# keep countries appearing more than 8 times in the top 20%
countries90sTopHDI <- countries90sTopHDI[which(countries90sTopHDI[2] > 8),1]
countries90sTopHDI <- as.character(countries90sTopHDI)
CO2_profiles_decil <- co2_pc_1950_filter %>%
  mutate(d_1950 = ntile(`1950`, 10),
         d_1960 = ntile(`1960`, 10),
         d_1970 = ntile(`1970`, 10),
         d_1980 = ntile(`1980`, 10),
         d_1990 = ntile(`1990`, 10),
         d_2000 = ntile(`2000`, 10),
         d_2010 = ntile(`2010`, 10)) %>%
  select(country, d_1950, d_1960, d_1970, d_1980, d_1990, d_2000, d_2010)
CO2_profiles_decil$netSum <- computeRange(CO2_profiles_decil, c(2:(ncol(CO2_profiles_decil))), function(x) sum(diff(x))) # computeRange() is defined further below
Defining profiles by the variation of each country's CO2-emissions quartile:
behavior_p <- function(d){
  # d is a row of CO2_profiles; columns 3, 5 and 7 hold earlyQ, midQ and lateQ
  if(d[3] == 1 & d[5] == 1 & d[7] == 1){ # LOW EMISSIONS, NO CHANGE
    return(1)
  } else if(d[3] == 2 & d[5] == 2 & d[7] == 2){ # MID-LOW, NO CHANGE
    return(2)
  } else if(d[3] == 3 & d[5] == 3 & d[7] == 3){ # MID-HIGH, NO CHANGE
    return(3)
  } else if(d[3] == 4 & d[5] == 4 & d[7] == 4){ # HIGH EMISSIONS, NO CHANGE
    return(4)
  } else if(d[3] < d[7]){ # EMISSIONS INCREASE
    return(5)
  } else if(d[3] > d[7]){ # EMISSIONS DECREASE
    return(6)
  } else if(d[5] > d[7]){ # VARIATION MID HIGH TO LOW
    return(7)
  } else if(d[5] < d[7]){ # VARIATION MID LOW TO HIGH
    return(8)
  } else { # OTHER CASES
    return(9)
  }
}
Adding the variable to the profiles data frame:
# ADD BEHAVIOR
behavior <- apply(CO2_profiles, 1, behavior_p) # COMPUTE CATEGORIES
CO2_p_behavior <- CO2_profiles
CO2_p_behavior[, (ncol(CO2_p_behavior) + 1)] <- behavior # ADD TO NEW DF
colnames(CO2_p_behavior)[ncol(CO2_p_behavior)] <- "behavior"
subset_sample <- function(d, cats){
  # d is a data frame; cats is the index (or name) of the category column
  # take a 50% sample of the first-column IDs within every category
  output <- data.frame()
  n_cats <- length(unique(d[,cats]))
  for(i in 1:n_cats){
    subgroup <- d[which(d[,cats] == i),]
    sample_i <- sample(subgroup[,1], round(nrow(subgroup) * 0.50))
    sub_sample <- d[which(d[,1] %in% sample_i), ]
    output <- rbind(output, sub_sample)
  }
  return(output)
}
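A hypothetical call, assuming the behavior column created above (its categories are the integers 1 to 9, which is what the loop expects):
half_by_profile <- subset_sample(CO2_p_behavior, "behavior")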
computeRange <- function(d, range, compute){
  # d is a data source *DATA FRAME*
  # range is a *VECTOR* column range / ex: c(1:5) OR c(2:(ncol(MY_DF) - 1))
  # compute is the computing *FUNCTION* / ex: function(x) sum(diff(x))
  d <- as.data.frame(d)
  output <- numeric()
  for(r in 1:nrow(d)){
    getRow <- d[r, range]
    rowNums <- as.numeric(getRow)
    rowResult <- compute(rowNums)
    output <- c(output, rowResult)
  }
  return(output)
}
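A quick check on a toy frame (it is the same computation as apply(d[, range], 1, compute)):
toy <- data.frame(a = c(1, 10), b = c(3, 20), c = c(6, 15))
computeRange(toy, 1:3, function(x) sum(diff(x))) # 5 5, i.e. last minus first per row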
# Make a copy of the original DF
CO2_profiles_varStats <- CO2_p_behavior
# Create NEW COLUMNS, using the function (the same year-column range each time)
CO2_profiles_varStats$netSum <- computeRange(co2_pc_1950_filter, c(2:(ncol(co2_pc_1950_filter) - 1)), function(x) sum(diff(x)))
CO2_profiles_varStats$absSum <- computeRange(co2_pc_1950_filter, c(2:(ncol(co2_pc_1950_filter) - 1)), function(x) sum(Mod(diff(x))))
CO2_profiles_varStats$stDev <- computeRange(co2_pc_1950_filter, c(2:(ncol(co2_pc_1950_filter) - 1)), function(x) sd(x, na.rm = TRUE))
### Define function: top-code incomes at 100000
correct_income <- function(old_var){
  if(old_var >= 99999){100000}
  else {old_var}
}
### The result variable comes from applying the function above over the source
new_income <- sapply(income, correct_income)
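The same correction in a single vectorized step, with no sapply() needed:
new_income <- ifelse(income >= 99999, 100000, income)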
# COMPARING MEANS
mean_earlyXX <- lapply(earlyXX_awards, mean)
df_earlyXX <- as.data.frame(unlist(mean_earlyXX))
mean_midXX <- lapply(midXX_awards, mean)
df_midXX <- as.data.frame(unlist(mean_midXX))
mean_lateXX <- lapply(lateXX_awards, mean)
df_lateXX <- as.data.frame(unlist(mean_lateXX))
# Join them together
names(df_earlyXX)[1] <- "average_awards"
names(df_midXX)[1] <- "average_awards"
names(df_lateXX)[1] <- "average_awards"
sampleMeans_df <- rbind(df_earlyXX, df_midXX, df_lateXX)
# Add row names as year value
sampleMeans_df <- cbind(year5 = rownames(sampleMeans_df), sampleMeans_df)
sampleMeans_df$year5 <- as.character(sampleMeans_df$year5)
# Parse the column as a year number
library(stringr)
for(i in 1:nrow(sampleMeans_df)){
  getYear <- tail(strsplit(sampleMeans_df$year5[i], split = "-")[[1]], 1)
  sampleMeans_df$year5[i] <- getYear
}
sampleMeans_df$year5 <- as.integer(sampleMeans_df$year5)
library(data.table) # fread() comes from data.table
output <- fread("imdb/names.tsv", header = TRUE, nrows = 0)
h_names <- colnames(output)
filter_names <- function(){
  output <- fread("imdb/names.tsv", header = TRUE, nrows = 0)
  for(i in 1:9){
    j <- i - 1
    # read the file in chunks of one million rows
    names_raw <- fread("imdb/names.tsv", nrows = 1000000, skip = (j * 1000000))
    colnames(names_raw) <- h_names # add column names to names_raw
    # keep only the rows whose primaryName matches the cleaned director names
    filter <- names_raw[which(names_raw$primaryName %in% directors_ready)]
    rm(names_raw)
    output <- rbind(output, filter, fill = TRUE)
    rm(filter)
  }
  return(output)
}
library(data.table)
filter_file <- function(source, iteration, reference, join_key){
  # source: file location, e.g. "imdb/titles_info.tsv"
  # iteration: how many millions of rows are in the source
  # reference: a vector of values to filter with
  # join_key: a string naming the key variable shared by source and reference
  output <- fread(source, header = TRUE, nrows = 0)
  header_data <- colnames(output)
  for(i in 1:iteration){
    j <- i - 1
    data_raw <- fread(source, nrows = 1000000, skip = (j * 1000000))
    colnames(data_raw) <- header_data
    filter <- data_raw[which(data_raw[[join_key]] %in% reference)]
    rm(data_raw)
    output <- rbind(output, filter, fill = TRUE)
    rm(filter)
  }
  return(output)
}
arg_titles <- filter_file("imdb/crew.tsv", 6, argdirectors_imdb$nconst, "directors")
library(rvest)
directors_output1 <- character() # Define the vector outside of the loop
for(i in 1:13){ # 13 is the number of letter groups on the page (see url variable)
  url <- "https://es.wikipedia.org/wiki/Categor%C3%ADa:Directores_de_cine_de_Argentina"
  directors_input <- url %>%
    read_html() %>%
    html_nodes(xpath = paste('/html/body/div[3]/div[3]/div[4]/div[2]/div[2]/div/div/div[', i, ']/ul')) %>% # xpath is the HTML location
    html_text() # Obtain the HTML text from the node
  # The result is a single string with a "\n" between every name
  directors_split <- strsplit(directors_input, "\n")
  directors_output1 <- c(directors_output1, unlist(directors_split)) # unlist so the output stays a character vector
}
Ran on a VM.
# AWARD DATA SCRAPER
imdb_scrapper <- function(data, key, new_var_name){
  # data = ELEMENT dataset name
  # key = NUMBER identifier column in data
  # new_var_name = STRING name for the column of scraped data
  library(rebus)
  library(stringr)
  library(htmlwidgets)
  library(data.table)
  library(httr)
  library(rvest)
  data <- as.data.frame(data)
  key_length <- length(unique(data[,key]))
  hector <- numeric()
  for(i in 1:key_length){ # loop over the title IDs in the key column
    url <- paste0("https://www.imdb.com/title/", data[i, key], "/awards?ref_=tt_awd")
    input <- url %>%
      read_html() %>%
      html_nodes(xpath = '/html/body/div[2]/div/div[2]/div[3]/div[1]/div[1]/div/div[2]/div/div') %>%
      html_text() # Obtain the HTML text from the node
    # Read the input string and detect how many wins OR nominations
    if(length(input) == 0){
      hector <- c(hector, 0)
    } else {
      pat <- "(?<=all\\s)\\d+" # Detect the digits after "all "
      # awards = "(?<=all\\s)\\d+" // nominations = "(?<=and\\s)\\d+"
      get_data <- as.numeric(str_extract(input, pat))
      hector <- c(hector, get_data)
    }
    # Add the data to the vector and loop over
  }
  # Add a new column to data, named new_var_name
  nc <- ncol(data)
  data[,(nc + 1)] <- hector
  colnames(data)[nc + 1] <- new_var_name
  return(data)
}
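A hypothetical call against the test set built earlier (assuming column 1 holds the IMDb title ID):
test_awards <- imdb_scrapper(test_set, key = 1, new_var_name = "total_awards")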
Awards and nominations, worldwide sample case
imdb_data_assembler <- function(sample, key, yKey, pat){
  library(dplyr)
  library(rebus)
  library(stringr)
  library(htmlwidgets)
  library(data.table)
  library(httr)
  library(rvest)
  # Define the time span
  yFloor <- as.numeric(min(sample[,yKey]))
  yCeiling <- as.numeric(max(sample[,yKey]))
  ySpan <- ((yCeiling - yFloor) %/% 5) + 1 # enough 5-year buckets to cover the ceiling year
  # Create the output element
  output <- list()
  # Filter the movies within each 5-year window
  for(i in 1:ySpan){
    j <- i - 1
    current5y <- yFloor + (j * 5)
    # half-open window, so boundary years are not counted twice
    y5 <- sample[(sample[,yKey] %in% current5y:(current5y + 4)),]
    # Scrape the data
    key_length <- length(unique(y5[,key]))
    hector <- numeric()
    for(a in 1:key_length){ # loop over the title IDs in this window
      url <- paste0("https://www.imdb.com/title/", y5[a, key], "/awards?ref_=tt_awd")
      input <- character(0) # reset so a failed request cannot reuse the previous page
      try(input <- url %>%
            read_html() %>%
            html_nodes(xpath = '/html/body/div[2]/div/div[2]/div[3]/div[1]/div[1]/div/div[2]/div/div') %>%
            html_text())
      # Read the input string and detect how many wins OR nominations
      # awards = "(?<=all\\s)\\d+" // nominations = "(?<=and\\s)\\d+"
      try(if(length(input) == 0){
        hector <- c(hector, 0)
      } else {
        get_data <- as.numeric(str_extract(input, pat))
        hector <- c(hector, get_data)
      })
    }
    # Store the data
    ll <- length(output)
    output[[ll + 1]] <- hector
    names(output)[ll + 1] <- paste0(current5y, "-", (current5y + 4))
  }
  return(output)
}
my_output <- imdb_data_assembler(sample = test_me, key = 1, yKey = 4, pat = "(?<=all\\s)\\d+")
Store the results as lists; if the data sets are too large, divide them into time periods and assemble the results together.
earlyXX <- decade_sampling(movies_worldwide, 1, 3, 1900, 1940, sample_prop = 0.25)
earlyXX_awards <- imdb_data_assembler(sample = earlyXX, key = 1, yKey = 3, pat = "(?<=all\\s)\\d+")
#
midXX <- decade_sampling(movies_worldwide, 1, 3, 1940, 1980, sample_prop = 0.20)
midXX_awards <- imdb_data_assembler(sample = midXX, key = 1, yKey = 3, pat = "(?<=all\\s)\\d+")
#
lateXX <- decade_sampling(movies_worldwide, 1, 3, 1980, 2020, sample_prop = 0.20)
lateXX_awards <- imdb_data_assembler(sample = lateXX, key = 1, yKey = 3, pat = "(?<=all\\s)\\d+")
HDI_filter <- as.data.frame(HDI_filter)
CO2redTest <- function(q, f = 15, baseY){
  # q is the quantile, e.g. 0.75
  # f is how often a country must appear to be admitted among the high-HDI countries
  # baseY is the base year; the first average covers it plus the following 4 years
  hHDIcountries <- as.character()
  for(i in 1:26){
    column <- i + 1
    pHDI <- quantile(HDI_filter[,column], q, na.rm = T) # threshold for the top q%
    CpHDI <- HDI_filter %>%
      select(country, column)
    CpHDI <- CpHDI[which(CpHDI[,2] > pHDI),1]
    hHDIcountries <- c(hHDIcountries, CpHDI) # a vector of countries, repeated for every year they are in the top q%
  }
  hHDIcountries <- as.data.frame(table(hHDIcountries))
  # keep countries appearing more than f times in the top q%
  hHDIcountries <- hHDIcountries[which(hHDIcountries[2] > f),1]
  hHDIcountries <- as.character(hHDIcountries)
  y <- baseY - 1948 # convert the base year into its column index
  CO2_reduct <- as.data.frame(co2_pc_1950_filter)
  # Get the first average: base year plus the next 4
  CO2_reduct$avg1 <- apply(CO2_reduct[,y:(y + 4)], 1, function(x) sum(x, na.rm = T)/5)
  CO2_reduct <- CO2_reduct %>%
    rowwise() %>% # APPLY CALCULATIONS BY ROWS
    mutate(avg2 = (sum(`2010`, `2011`, `2012`, `2013`, `2014`, na.rm = T))/5,
           CO2reduction = ifelse(avg2 < avg1, "Reduced CO2", "No reduction"), # CHECK IF EMISSIONS WERE REDUCED
           HDI_level = ifelse(country %in% hHDIcountries, "High HDI", "Low HDI")) %>%
    select(country, avg1, avg2, CO2reduction, HDI_level)
  contT <- table(CO2_reduct$CO2reduction, CO2_reduct$HDI_level)
  #chiTest <- chisq.test(CO2_reduct$CO2reduction, CO2_reduct$HDI_level, correct = FALSE)
  #output <- list(contT, chiTest)
  return(contT)
}
c1990_80 <- CO2redTest(0.8, 15, 1990)
chisq.test(c1990_80, correct = FALSE)
library(maps)
WorldData <- map_data('world')
WorldData %>% filter(region != "Antarctica") -> WorldData
WorldData <- fortify(WorldData)
map <- ggplot() + geom_map(data=WorldData, map=WorldData,
                           aes(x=long, y=lat, group=group, map_id=region),
                           fill="white", colour="#7f7f7f", size=0.5)
profileMap <- map + scale_fill_continuous(low="green", high="darkred", guide="colorbar", labels = c("Low", "Mid-low", "Mid-High", "High")) +
  guides(fill = guide_colourbar(barheight = 8))
profileMap + geom_map(data=CO2_profiles, map=WorldData,
                      aes(fill=earlyQ, map_id=country),
                      colour="#7f7f7f", size=0.5) +
  labs(title = "CO2 Emission profiles ranking", subtitle = "1950-1965", fill = "Profile group") +
  theme_void() # Remove styles
ggsave("emissionProf_50_65.png")
library(animation)
Sys.setenv(PATH = paste("C:/PROGRA~1/ImageMagick-7.0.7-Q8",
Sys.getenv("PATH"), sep = ";"))
ani.options(convert = 'C:/PROGRA~1/ImageMagick-7.0.7-Q8/convert.exe')
### ORIGINAL TEST - IT WORKED!
library(gganimate) # transition_time() and animate() come from gganimate
boxplot_animate <- CO2_year_country %>%
  group_by(b_color)
boxplot_animate$year <- as.integer(boxplot_animate$year)
a_test <- ggplot(boxplot_animate, aes(x = b_color, y = CO2epc, frame = year)) +
  geom_boxplot() + transition_time(year)
animate(a_test)
### IMPROVING THE TEST
animated_bp_profile <- ggplot(boxplot_animate, aes(x = b_color, y = CO2epc, frame = year, fill = b_color)) +
  geom_boxplot() + transition_time(year) +
  scale_y_continuous(limits = c(0, 20)) +
  scale_fill_manual(values = sort(unique(CO2_year_country$b_color)), breaks = CO2_year_country$behavior) +
  labs(title = "CO2 Emissions per capita by profiles", subtitle = 'Year: {frame_time}', x = 'Profile', y = 'CO2 Emission per capita')
animate(animated_bp_profile)