Define Libraries
library(tidyr) #for gather, drop
library(dplyr) #for glimpse
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Setting directory 작업 디렉토리 설정
setwd("C:/Users/chan0/Desktop/econ 390 assignment 1")
# Reading download data 데이터 불러오기기
data <- read.csv("annual-emissions.csv")
# Using head(data) 데이터의 6행까지 출력
head(data)
## State Facility.Name Facility.ID Year Gross.Load..MWh. Steam.Load..1000.lb.
## 1 AL Barry 3 2000 12787607 NA
## 2 AL Barry 3 2001 16137285 NA
## 3 AL Barry 3 2002 17931286 NA
## 4 AL Barry 3 2003 15475217 NA
## 5 AL Barry 3 2004 15516154 NA
## 6 AL Barry 3 2005 15471435 NA
## SO2.Mass..short.tons. CO2.Mass..short.tons. NOx.Mass..short.tons.
## 1 61331.11 12435918 23667.13
## 2 65902.38 13683576 23026.47
## 3 59524.80 14700456 24400.65
## 4 52706.28 13941717 22451.15
## 5 47470.36 13217576 19312.65
## 6 53652.94 13716973 19248.53
## Heat.Input..mmBtu.
## 1 124857223
## 2 147934446
## 3 162155771
## 4 146353476
## 5 140814946
## 6 143241269
# Checking for the name 열 이름 출력
colnames(data)
## [1] "State" "Facility.Name" "Facility.ID"
## [4] "Year" "Gross.Load..MWh." "Steam.Load..1000.lb."
## [7] "SO2.Mass..short.tons." "CO2.Mass..short.tons." "NOx.Mass..short.tons."
## [10] "Heat.Input..mmBtu."
# Select the data
selected_data <- data[, c("State", "Facility.Name", "Year", "Gross.Load..MWh.", "CO2.Mass..short.tons.")]
# Change Gross.Load..MWH. TO GrossLoad 열 이름 바꾸기
colnames(data)[colnames(data) == "Gross.Load..MWh."] <- "GrossLoad"
# Change CO2.Mass..short.tons. TO CO2Mass
colnames(data)[colnames(data) == "CO2.Mass..short.tons."] <- "CO2Mass"
# Display overview of the structure 간단한 구조를 보여주는 것
str(data)
## 'data.frame': 32517 obs. of 10 variables:
## $ State : chr "AL" "AL" "AL" "AL" ...
## $ Facility.Name : chr "Barry" "Barry" "Barry" "Barry" ...
## $ Facility.ID : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Year : int 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
## $ GrossLoad : num 12787607 16137285 17931286 15475217 15516154 ...
## $ Steam.Load..1000.lb. : num NA NA NA NA NA NA NA NA NA NA ...
## $ SO2.Mass..short.tons.: num 61331 65902 59525 52706 47470 ...
## $ CO2Mass : num 12435918 13683576 14700456 13941717 13217576 ...
## $ NOx.Mass..short.tons.: num 23667 23026 24401 22451 19313 ...
## $ Heat.Input..mmBtu. : num 1.25e+08 1.48e+08 1.62e+08 1.46e+08 1.41e+08 ...
# Using Gather to transform to long format
data_long <- gather(data,
key = "Measure", # create "measure" and add Grossload,CO2mass together
value = "Value", # the value output
GrossLoad, CO2Mass)
# Preview of the data set 데이터 구조 미리보기
glimpse(data_long)
## Rows: 65,034
## Columns: 10
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", …
## $ Facility.Name <chr> "Barry", "Barry", "Barry", "Barry", "Barry", "Ba…
## $ Facility.ID <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Year <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
## $ Steam.Load..1000.lb. <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ SO2.Mass..short.tons. <dbl> 61331.107, 65902.382, 59524.802, 52706.279, 4747…
## $ NOx.Mass..short.tons. <dbl> 23667.132, 23026.466, 24400.653, 22451.149, 1931…
## $ Heat.Input..mmBtu. <dbl> 124857223, 147934446, 162155771, 146353476, 1408…
## $ Measure <chr> "GrossLoad", "GrossLoad", "GrossLoad", "GrossLoa…
## $ Value <dbl> 12787607, 16137285, 17931286, 15475217, 15516154…
# Using spread to undo from long format
data_wide <- spread(data_long, key = "Measure", value = "Value")
# Preview of the data set 데이터 구조 미리보기
glimpse(data_wide)
## Rows: 32,517
## Columns: 10
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", …
## $ Facility.Name <chr> "Barry", "Barry", "Barry", "Barry", "Barry", "Ba…
## $ Facility.ID <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Year <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
## $ Steam.Load..1000.lb. <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ SO2.Mass..short.tons. <dbl> 61331.107, 65902.382, 59524.802, 52706.279, 4747…
## $ NOx.Mass..short.tons. <dbl> 23667.132, 23026.466, 24400.653, 22451.149, 1931…
## $ Heat.Input..mmBtu. <dbl> 124857223, 147934446, 162155771, 146353476, 1408…
## $ CO2Mass <dbl> 12435918, 13683576, 14700456, 13941717, 13217576…
## $ GrossLoad <dbl> 12787607, 16137285, 17931286, 15475217, 15516154…
# Select columns that include "load"?
selected_data2 <- data %>% select(-contains("Load"))
#Double checking for preview
glimpse(selected_data2)
## Rows: 32,517
## Columns: 8
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", …
## $ Facility.Name <chr> "Barry", "Barry", "Barry", "Barry", "Barry", "Ba…
## $ Facility.ID <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Year <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
## $ SO2.Mass..short.tons. <dbl> 61331.107, 65902.382, 59524.802, 52706.279, 4747…
## $ CO2Mass <dbl> 12435918, 13683576, 14700456, 13941717, 13217576…
## $ NOx.Mass..short.tons. <dbl> 23667.132, 23026.466, 24400.653, 22451.149, 1931…
## $ Heat.Input..mmBtu. <dbl> 124857223, 147934446, 162155771, 146353476, 1408…
clean_data <- drop_na(selected_data2) #delete rows with NA
glimpse(clean_data)
## Rows: 27,291
## Columns: 8
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", …
## $ Facility.Name <chr> "Barry", "Barry", "Barry", "Barry", "Barry", "Ba…
## $ Facility.ID <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Year <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
## $ SO2.Mass..short.tons. <dbl> 61331.107, 65902.382, 59524.802, 52706.279, 4747…
## $ CO2Mass <dbl> 12435917.8, 13683576.3, 14700455.8, 13941716.7, …
## $ NOx.Mass..short.tons. <dbl> 23667.132, 23026.466, 24400.653, 22451.149, 1931…
## $ Heat.Input..mmBtu. <dbl> 124857223, 147934446, 162155771, 146353476, 1408…
# need to group by state
summary_co2 <- clean_data %>%
group_by(State) %>%
summarise(mean = mean(CO2Mass), std_dev = sd(CO2Mass), n = n())
glimpse(summary_co2)
## Rows: 49
## Columns: 4
## $ State <chr> "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "I…
## $ mean <dbl> 3398371.2, 2002796.4, 2342109.0, 416463.2, 1776612.6, 540502.5…
## $ std_dev <dbl> 5191485.93, 3415015.41, 3981855.91, 657256.50, 2509585.76, 752…
## $ n <int> 516, 389, 532, 2090, 543, 341, 13, 217, 1360, 724, 520, 96, 11…
summary_co2_by_year <- clean_data %>%
group_by(Year) %>%
summarise(mean = mean(CO2Mass, na.rm = TRUE), # CO2의 평균 (NA값 제외)
std_dev = sd(CO2Mass, na.rm = TRUE),
n = n())
glimpse(summary_co2_by_year)
## Rows: 23
## Columns: 4
## $ Year <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 20…
## $ mean <dbl> 2968348, 2550567, 2215485, 2116357, 2111045, 2111633, 2045790,…
## $ std_dev <dbl> 4152778, 3858703, 3722301, 3682612, 3691496, 3720163, 3682011,…
## $ n <int> 826, 939, 1094, 1165, 1174, 1203, 1219, 1240, 1283, 1244, 1265…
This can be done with filter command. Then rename co2 column for 2022 to co2_2022 and co2 for 2021 to co2_2021.
data_2022 <- summary_co2_by_year %>%
filter(Year == 2022) %>% # Year가 2022인 데이터만 필터링
rename(co2_2022 = mean) # CO2의 평균을 co2_2022로 변경
data_2021 <- summary_co2_by_year %>%
filter(Year == 2021) %>% # Year가 2021인 데이터만 필터링
rename(co2_2021 = mean)
#checking for data 결과 확인
glimpse(data_2022)
## Rows: 1
## Columns: 4
## $ Year <int> 2022
## $ co2_2022 <dbl> 1440012
## $ std_dev <dbl> 2357372
## $ n <int> 1172
glimpse(data_2021)
## Rows: 1
## Columns: 4
## $ Year <int> 2021
## $ co2_2021 <dbl> 1454349
## $ std_dev <dbl> 2447018
## $ n <int> 1169
combined_data <- left_join(data_2021, data_2022, by = "Year")
#checking for date 결과 확인
print(combined_data)
## # A tibble: 1 × 7
## Year co2_2021 std_dev.x n.x co2_2022 std_dev.y n.y
## <int> <dbl> <dbl> <int> <dbl> <dbl> <int>
## 1 2021 1454349. 2447018. 1169 NA NA NA