In this markdown, the following dataset has been explored: * Hourly electric system operating in US by region * Employment rate by gender in Europe * Individuals using the internet % of population around the world
library(DBI)
library(RPostgres)
library(knitr)
library(data.table)
library(ggplot2)
set_wd <- function() {
library(rstudioapi) # make sure you have it installed
current_path <- getActiveDocumentContext()$path
setwd(dirname(current_path ))
print( getwd() )
}
The data illustrates supply and demand of electricity in hourly manner covering entire region in the US in the last week of Feb, 2018. (The unit is megawatthours) * Demand * Day-ahead demand forecast * Net generation * Total net actual interchange
# read csv file
dat1 <- fread("./source_lib/dat1/edit_us_hourly_electric_system_operating_data.csv")
# View the table
#head(dat1)
# Check briefly how the data is composed
table(dat1$megawatthours)
##
## California (region)
## 1
## Carolinas (region)
## 1
## Central (region)
## 1
## Day-ahead demand forecast
## 14
## Demand
## 14
## Electric Reliability Council of Texas
## 1
## Florida (region)
## 1
## Mid-Atlantic (region)
## 1
## Midwest (region)
## 1
## Net generation
## 14
## New England ISO (region)
## 1
## New York Independent System Operator (region)
## 1
## Northwest (region)
## 1
## Southeast (region)
## 1
## Southwest (region)
## 1
## Tennessee Valley Authority (region)
## 1
## Total net actual interchange
## 14
## United States Lower 48 (region)
## 1
# Extract region name (The region name is shown every 5 rows)
regions <- list(unique(dat1$megawatthours))
# Remove those that are not region name
regions <- unlist(regions)[-c(2,3,4,5)]
# Paste the region name for every 5 rows
dat1[, .region := rep(regions, each=5)]
# change column order
dat1<-dat1[,c(194, 1:193)]
# Check NA value
table(is.na(dat1))
##
## FALSE TRUE
## 10905 2675
library(Amelia)
missmap(dat1)
table(is.na(dat1))
##
## FALSE TRUE
## 10905 2675
# View dataset structure
#str(dat1)
# The third column is "chr", because one cell includes "Inc. (region)".
dat1[16,3] <- NA
# Since as.numeric(as.character(x)) still coerce NA, I used lapply to fix the problem.
dat1[,3] <- lapply(dat1[,3], function(x) as.numeric(as.character(x)))
# Transform the data
library(tidyr)
tidy.dat1 <- dat1[-c(seq(1, length(dat1), 5))] %>% # Remove NA lines
gather(timestamp, mega.watt.hours, 3:194) %>% # gather timestamp from column 3:194 and the value as mega.watt.hours
spread(megawatthours, mega.watt.hours) %>% # spread the value
separate(timestamp, c('hour', 'minute', 'est', 'month', 'day', 'year')) # separate the timestamp
# change colnames
colnames(tidy.dat1) <- c("region", "hour", "minute", "est", "month", "day", "year", "forecast", "demand", "supply", "net")
# Analysis
head(tidy.dat1)
## region hour minute est month day year forecast demand
## 1 California (region) 00 00 EST 2 19 2018 30744 30499
## 2 California (region) 00 00 EST 2 20 2018 33712 34720
## 3 California (region) 00 00 EST 2 21 2018 6032 35715
## 4 California (region) 00 00 EST 2 22 2018 35379 34930
## 5 California (region) 00 00 EST 2 23 2018 35068 35563
## 6 California (region) 00 00 EST 2 24 2018 33431 35330
## supply net
## 1 19443 -11054
## 2 25495 -9225
## 3 25776 -10193
## 4 24109 -11060
## 5 24038 -11524
## 6 22535 -12795
table(tidy.dat1$region)
##
## California (region)
## 192
## Carolinas (region)
## 192
## Central (region)
## 192
## Electric Reliability Council of Texas
## 192
## Florida (region)
## 192
## Mid-Atlantic (region)
## 192
## Midwest (region)
## 192
## New England ISO (region)
## 192
## New York Independent System Operator (region)
## 192
## Northwest (region)
## 192
## Southeast (region)
## 192
## Southwest (region)
## 192
## Tennessee Valley Authority (region)
## 192
## United States Lower 48 (region)
## 192
str(tidy.dat1)
## 'data.frame': 2688 obs. of 11 variables:
## $ region : chr "California (region)" "California (region)" "California (region)" "California (region)" ...
## $ hour : chr "00" "00" "00" "00" ...
## $ minute : chr "00" "00" "00" "00" ...
## $ est : chr "EST" "EST" "EST" "EST" ...
## $ month : chr "2" "2" "2" "2" ...
## $ day : chr "19" "20" "21" "22" ...
## $ year : chr "2018" "2018" "2018" "2018" ...
## $ forecast: num 30744 33712 6032 35379 35068 ...
## $ demand : num 30499 34720 35715 34930 35563 ...
## $ supply : num 19443 25495 25776 24109 24038 ...
## $ net : num -11054 -9225 -10193 -11060 -11524 ...
tidy.dat1 <- as.data.table(tidy.dat1)
library(plotly)
library(dplyr)
g1 <- tidy.dat1 %>%
mutate(mean = mean(net)) %>%
group_by(region, hour) %>%
summarize(mean=mean(net)) %>%
ggplot(aes(hour, mean)) +
geom_point(alpha = 0.3) +
geom_line(aes(group = region,
color = region)) +
labs(title = "mean of NET generation of Electricity per hour by region",
x = "hour", y = "Net (mean value)")
ggplotly(g1)
The visualization shows the mean of net generation of electiricty per hour by region for 1 week of time end of Feb, 2018. There are many states that had not met the needs: California, New York, Midwest, New England, Florida.
# Tidy up the data with new column called perc that shows supply/demand value, grouped by region and hour
tidy.dat1.perc <- tidy.dat1 %>%
mutate(perc = supply/demand) %>%
group_by(region, hour)
# create color column: if the perc is >= 1, 'above', else 'below'. We will use this in visualization.
tidy.dat1.perc$color[tidy.dat1.perc$perc >= 1] <- 'above'
tidy.dat1.perc$color[tidy.dat1.perc$perc < 1] <- 'below'
# visualization
g2 <- tidy.dat1.perc %>%
ggplot(aes(hour, perc,
color = color)) +
geom_line() +
geom_hline(yintercept = 1) +
scale_y_continuous(breaks = seq(0,2,0.1)) +
facet_wrap(~region, ncol = 5) +
labs(title = "supply/demand ratio per hour by region",
x = "hour", y = "supply/demand ratio") +
theme(axis.text.x = element_text(angle= 90, hjust=1),
legend.position= "bottom") +
scale_color_manual(values = c('above' = 'green', 'below' = 'red'))
g2
The visualization illustrates the supply/demand ratio of electricty by region. The y intercept(1) indicates that the electricty needs were sufficiently met. Distinctively, we can see that California is way below the capacity to meet demands followed by New England and New York. Midwest and Florida is slightly behind to meet the demands. Southeast shows interesting trend; The state appears to acknowledge that there are needs to meet between 18h00 and 20h00 to generate the electricity, an hour ahead.
The figures displays the % of population aged 20 ~ 64 that is employed. Employment in this definition means those that worked at least 1 hour for pay or profit or were temporarily absent from work. For this dataset, as the file was provided, divided in gender, I am going to label each sheet with gender m/f to display the employment rate over Europe by gender.
library(rJava)
library(xlsxjars)
library(xlsx)
# Read 2 excel file
dat2.f <- read.xlsx2("./source_lib/dat2/female.xlsx", "Sheet0", header = T)
dat2.m <- read.xlsx2("./source_lib/dat2/male.xlsx", "Sheet0", header = T)
# Add gender column
dat2.f$gender <- "female"
dat2.m$gender <- "male"
# rbind the table
dat2 <- rbind(dat2.f, dat2.m)
# wide to long format
tidy.dat2 <- dat2 %>% gather(year, perc, 2:18)
# this is written in the data that ":" is the null value.
table(tidy.dat2$perc == ":") # 1170, 88
##
## FALSE TRUE
## 1170 88
library(stringr)
tidy.dat2$year <- gsub(tidy.dat2$year, pattern ="X", replacement = "", fixed = T) # remove the X in year column
tidy.dat2 <- as.data.table(tidy.dat2)
tidy.dat2[perc == ":"]$perc <- NA # assign NA to null values
table(tidy.dat2$perc == ":") # 1170
##
## FALSE
## 1170
tidy.dat2[,4] <- lapply(tidy.dat2[,4], function(x) as.numeric(as.character(x))) # change the numeric column from character type to numeric
unique(tidy.dat2$geo.time) # check region names
## [1] EU (28 countries)
## [2] EU (27 countries)
## [3] Euro area (19 countries)
## [4] Euro area (18 countries)
## [5] Belgium
## [6] Bulgaria
## [7] Czech Republic
## [8] Denmark
## [9] Germany
## [10] Estonia
## [11] Ireland
## [12] Greece
## [13] Spain
## [14] France
## [15] Croatia
## [16] Italy
## [17] Cyprus
## [18] Latvia
## [19] Lithuania
## [20] Luxembourg
## [21] Hungary
## [22] Malta
## [23] Netherlands
## [24] Austria
## [25] Poland
## [26] Portugal
## [27] Romania
## [28] Slovenia
## [29] Slovakia
## [30] Finland
## [31] Sweden
## [32] United Kingdom
## [33] Iceland
## [34] Norway
## [35] Switzerland
## [36] Former Yugoslav Republic of Macedonia, the
## [37] Turkey
## 37 Levels: Austria Belgium Bulgaria Croatia Cyprus ... United Kingdom
tidy.dat2[geo.time == "Former Yugoslav Republic of Macedonia, the"]$geo.time <- "Macedonia"
# Divide the dataset by region and by countries
tidy.dat2.group <- tidy.dat2[geo.time == "EU (28 countries)" |
geo.time == "EU (27 countries)" |
geo.time == "Euro area (19 countries)" |
geo.time == "Euro area (18 countries)"]
tidy.dat2.countries <- tidy.dat2[geo.time != "EU (28 countries)" &
geo.time != "EU (27 countries)" &
geo.time != "Euro area (19 countries)" &
geo.time != "Euro area (18 countries)"]
str(tidy.dat2.group)
## Classes 'data.table' and 'data.frame': 136 obs. of 4 variables:
## $ geo.time: Factor w/ 38 levels "Austria","Belgium",..: 10 9 12 11 10 9 12 11 10 9 ...
## $ gender : chr "female" "female" "female" "female" ...
## $ year : chr "2000" "2000" "2000" "2000" ...
## $ perc : num NA 57.3 54.9 54.8 NA 76 76.3 76.4 57.9 57.9 ...
## - attr(*, ".internal.selfref")=<externalptr>
ggplot(tidy.dat2.group, aes(year, perc)) +
geom_point(alpha = 0.5, aes(fill = gender)) +
geom_line(aes(group = gender,
color = gender)) +
facet_wrap(~geo.time,ncol = 4) +
theme(axis.text.x = element_text(angle= 45, hjust=1),
legend.position= "bottom") +
labs(title = "Employment % in EU by group per year",
x = "year",
y = "employment %")
The graph above shows the employment rate between gender per year by regions in Europe. All of them shows the same trend. The part to notice is the lowest % of females in Euro Area (18,19 countries) was at 55% by 2000. As aggregated data isn’t telling us much of what we need, we can divide the data into countries to see whether there are specific countries that impacts the entire trend.
ggplot(tidy.dat2.countries, aes(year, perc)) +
geom_line(aes(group = gender,
color = gender)) +
facet_wrap(~geo.time, ncol = 11) +
theme(axis.text.x = element_text(angle= 90, hjust=1),
legend.position= "bottom") +
labs(title = "Employment % in EU by countries per year",
x = "year",
y = "employment %")
The graph above shows the employment rate between genders in countries in Europe. Each countries show distinctive pattern and some of the countries does not provide data for full time range until recent years (France, Switzerland, Turkey, Macedonia). It appears that the employment rate % of each gender shows pararell line unless both lines dropped to meet at point. Some of the countries shows significant progress (Austria, Belgium, Cyprus, Ireland, Luxembourg, Malta, Spain) as the gap between gender is becoming smaller compare to its rate in the past. Latvia, Lithuania shows that the employment gap between gender almost closed since 2010.
tidy.dat2.gap <- dcast(tidy.dat2.countries, geo.time + year ~ gender, sum) # Use the long data to wide data
tidy.dat2.gap[, gap := female-male] # mutate `gap` column that shows percentage of female - male
ggplot(tidy.dat2.gap, aes(year, gap)) +
geom_point(size = 1, alpha = 0.4) +
geom_line() +
facet_wrap(~geo.time, ncol = 11) +
theme(axis.text.x = element_text(angle= 90, hjust=1),
legend.position= "bottom")+
labs(title = "Employment % gap between gender in EU by countries per year",
x = "year",
y = "employment % gap between gender")
The visualization above shows the employment rate gap between gender each year by countries. Some of the countries shows rough slope (Spain, Malta, Luxembourg etc.) meaning that the countries gender gap of employment rate is improving to a degree.
The numbers show the % of population using the internet around the world from 1993 ~ 2016
# use fread to read the csv file in data table
dat3 <- fread("./source_lib/dat3/SDG_csv/SDGData.csv")
# read region csv file in data table
dat3.region <- fread("./source_lib/dat3/SDG_csv/SDGCountry.csv")
# extract the region names in Region column then remove the empty value on 5th.
dat3.region <- unlist(list((unique(dat3.region$Region))))[-5]
# filter the specific value: In this dataset we will use the internet usage population %
dat3 <- dat3[`Indicator Name` == 'Individuals using the Internet (% of population)']
# remove 2017 and V33 column
dat3[,c(32,33)] <- NULL
# make the wide format data into long format data gathering all years in year column together.
tidy.dat3 <- dat3 %>% gather(year, perc, 5:31)
tidy.dat3 <- as.data.table(tidy.dat3)
setkey(tidy.dat3)
# filter the data that has its `country name` in dat3.region (which is a string of unique region names)
tidy.dat3.region <- tidy.dat3[`Country Name` %in% dat3.region]
# change the perc column from character to numeric value
tidy.dat3.region[,6] <- lapply(tidy.dat3.region[,6], function(x) as.numeric(as.character(x)))
str(tidy.dat3.region)
## Classes 'data.table' and 'data.frame': 189 obs. of 6 variables:
## $ Country Name : chr "East Asia & Pacific" "East Asia & Pacific" "East Asia & Pacific" "East Asia & Pacific" ...
## $ Country Code : chr "EAS" "EAS" "EAS" "EAS" ...
## $ Indicator Name: chr "Individuals using the Internet (% of population)" "Individuals using the Internet (% of population)" "Individuals using the Internet (% of population)" "Individuals using the Internet (% of population)" ...
## $ Indicator Code: chr "IT.NET.USER.ZS" "IT.NET.USER.ZS" "IT.NET.USER.ZS" "IT.NET.USER.ZS" ...
## $ year : chr "1990" "1991" "1992" "1993" ...
## $ perc : num 0.00729 NA NA 0.07928 0.11738 ...
## - attr(*, ".internal.selfref")=<externalptr>
# visualization
ggplot(tidy.dat3.region, aes(year, perc,
group = `Country Name`,
fill = `Country Name`,
color = `Country Name`)) +
geom_point() +
geom_line() +
labs(title = "Individuals using the Internet (% of population) by region",
x = "Year",
y = "Internet usage % of population") +
theme(axis.text.x = element_text(angle= 90, hjust=1))
The graph above shows the % of population using the internet by region. By 2016, the population of North America that uses internet is close to 80%, followed by Europe. Latin America shows nearly 60% of the population uses internet, followed by East Asia and Middle east and North africa.