Define Libraries

library(tidyr) #for gather, drop
library(dplyr) #for glimpse 
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Setting directory and reading data

# Setting directory 작업 디렉토리 설정
setwd("C:/Users/chan0/Desktop/econ 390 assignment 1")

# Reading download data 데이터 불러오기기
data <- read.csv("annual-emissions.csv")

STEP-2A View the first 6 rows of the data with head

# Using head(data) 데이터의 6행까지 출력
head(data)
##   State Facility.Name Facility.ID Year Gross.Load..MWh. Steam.Load..1000.lb.
## 1    AL         Barry           3 2000         12787607                   NA
## 2    AL         Barry           3 2001         16137285                   NA
## 3    AL         Barry           3 2002         17931286                   NA
## 4    AL         Barry           3 2003         15475217                   NA
## 5    AL         Barry           3 2004         15516154                   NA
## 6    AL         Barry           3 2005         15471435                   NA
##   SO2.Mass..short.tons. CO2.Mass..short.tons. NOx.Mass..short.tons.
## 1              61331.11              12435918              23667.13
## 2              65902.38              13683576              23026.47
## 3              59524.80              14700456              24400.65
## 4              52706.28              13941717              22451.15
## 5              47470.36              13217576              19312.65
## 6              53652.94              13716973              19248.53
##   Heat.Input..mmBtu.
## 1          124857223
## 2          147934446
## 3          162155771
## 4          146353476
## 5          140814946
## 6          143241269

STEP-2B select state, facility name, Year, Gross Load, CO2 MASS

# Checking for the name 열 이름 출력
colnames(data)
##  [1] "State"                 "Facility.Name"         "Facility.ID"          
##  [4] "Year"                  "Gross.Load..MWh."      "Steam.Load..1000.lb." 
##  [7] "SO2.Mass..short.tons." "CO2.Mass..short.tons." "NOx.Mass..short.tons."
## [10] "Heat.Input..mmBtu."
# Select the data 
selected_data <- data[, c("State", "Facility.Name", "Year", "Gross.Load..MWh.", "CO2.Mass..short.tons.")]

STEP-2C Change column names of at least one variable

# Change Gross.Load..MWH. TO GrossLoad 열 이름 바꾸기
colnames(data)[colnames(data) == "Gross.Load..MWh."] <- "GrossLoad"
# Change CO2.Mass..short.tons. TO CO2Mass
colnames(data)[colnames(data) == "CO2.Mass..short.tons."] <- "CO2Mass"

STEP-2D Check the structure of each variable with str()

# Display overview of the structure 간단한 구조를 보여주는 것
str(data)
## 'data.frame':    32517 obs. of  10 variables:
##  $ State                : chr  "AL" "AL" "AL" "AL" ...
##  $ Facility.Name        : chr  "Barry" "Barry" "Barry" "Barry" ...
##  $ Facility.ID          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Year                 : int  2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
##  $ GrossLoad            : num  12787607 16137285 17931286 15475217 15516154 ...
##  $ Steam.Load..1000.lb. : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ SO2.Mass..short.tons.: num  61331 65902 59525 52706 47470 ...
##  $ CO2Mass              : num  12435918 13683576 14700456 13941717 13217576 ...
##  $ NOx.Mass..short.tons.: num  23667 23026 24401 22451 19313 ...
##  $ Heat.Input..mmBtu.   : num  1.25e+08 1.48e+08 1.62e+08 1.46e+08 1.41e+08 ...

STEP-2E Change the format of the data to long.

# Using Gather to transform to long format
data_long <- gather(data, 
                  key = "Measure",     # create "measure" and add Grossload,CO2mass together
                  value = "Value",     # the value output
                  GrossLoad, CO2Mass)
# Preview of the data set 데이터 구조 미리보기 
glimpse(data_long)
## Rows: 65,034
## Columns: 10
## $ State                 <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", …
## $ Facility.Name         <chr> "Barry", "Barry", "Barry", "Barry", "Barry", "Ba…
## $ Facility.ID           <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Year                  <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
## $ Steam.Load..1000.lb.  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ SO2.Mass..short.tons. <dbl> 61331.107, 65902.382, 59524.802, 52706.279, 4747…
## $ NOx.Mass..short.tons. <dbl> 23667.132, 23026.466, 24400.653, 22451.149, 1931…
## $ Heat.Input..mmBtu.    <dbl> 124857223, 147934446, 162155771, 146353476, 1408…
## $ Measure               <chr> "GrossLoad", "GrossLoad", "GrossLoad", "GrossLoa…
## $ Value                 <dbl> 12787607, 16137285, 17931286, 15475217, 15516154…

STEP-2F Have the data back to wide format.

# Using spread to undo from long format
data_wide <- spread(data_long, key = "Measure", value = "Value")
# Preview of the data set 데이터 구조 미리보기 
glimpse(data_wide)
## Rows: 32,517
## Columns: 10
## $ State                 <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", …
## $ Facility.Name         <chr> "Barry", "Barry", "Barry", "Barry", "Barry", "Ba…
## $ Facility.ID           <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Year                  <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
## $ Steam.Load..1000.lb.  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ SO2.Mass..short.tons. <dbl> 61331.107, 65902.382, 59524.802, 52706.279, 4747…
## $ NOx.Mass..short.tons. <dbl> 23667.132, 23026.466, 24400.653, 22451.149, 1931…
## $ Heat.Input..mmBtu.    <dbl> 124857223, 147934446, 162155771, 146353476, 1408…
## $ CO2Mass               <dbl> 12435918, 13683576, 14700456, 13941717, 13217576…
## $ GrossLoad             <dbl> 12787607, 16137285, 17931286, 15475217, 15516154…

STEP-2G Select all columns except Load.

# Select columns that include "load"?
selected_data2 <- data %>% select(-contains("Load"))

#Double checking for preview
glimpse(selected_data2)
## Rows: 32,517
## Columns: 8
## $ State                 <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", …
## $ Facility.Name         <chr> "Barry", "Barry", "Barry", "Barry", "Barry", "Ba…
## $ Facility.ID           <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Year                  <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
## $ SO2.Mass..short.tons. <dbl> 61331.107, 65902.382, 59524.802, 52706.279, 4747…
## $ CO2Mass               <dbl> 12435918, 13683576, 14700456, 13941717, 13217576…
## $ NOx.Mass..short.tons. <dbl> 23667.132, 23026.466, 24400.653, 22451.149, 1931…
## $ Heat.Input..mmBtu.    <dbl> 124857223, 147934446, 162155771, 146353476, 1408…

STEP-2H Remove all rows with “NA” (that is missing values).

clean_data <- drop_na(selected_data2) #delete rows with NA 
glimpse(clean_data)
## Rows: 27,291
## Columns: 8
## $ State                 <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", …
## $ Facility.Name         <chr> "Barry", "Barry", "Barry", "Barry", "Barry", "Ba…
## $ Facility.ID           <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ Year                  <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
## $ SO2.Mass..short.tons. <dbl> 61331.107, 65902.382, 59524.802, 52706.279, 4747…
## $ CO2Mass               <dbl> 12435917.8, 13683576.3, 14700455.8, 13941716.7, …
## $ NOx.Mass..short.tons. <dbl> 23667.132, 23026.466, 24400.653, 22451.149, 1931…
## $ Heat.Input..mmBtu.    <dbl> 124857223, 147934446, 162155771, 146353476, 1408…

STEP-2I Calculate the Mean and SD of CO2 by State.

# need to group by state 
summary_co2 <- clean_data %>%
  group_by(State) %>%
  summarise(mean = mean(CO2Mass), std_dev = sd(CO2Mass), n = n())
glimpse(summary_co2)
## Rows: 49
## Columns: 4
## $ State   <chr> "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "I…
## $ mean    <dbl> 3398371.2, 2002796.4, 2342109.0, 416463.2, 1776612.6, 540502.5…
## $ std_dev <dbl> 5191485.93, 3415015.41, 3981855.91, 657256.50, 2509585.76, 752…
## $ n       <int> 516, 389, 532, 2090, 543, 341, 13, 217, 1360, 724, 520, 96, 11…

STEP-2J Repeat the calculation of Mean and SD of CO2 by Year

summary_co2_by_year <- clean_data %>%
  group_by(Year) %>%
  summarise(mean = mean(CO2Mass, na.rm = TRUE),    # CO2의 평균 (NA값 제외)
            std_dev = sd(CO2Mass, na.rm = TRUE),    
            n = n())                               
glimpse(summary_co2_by_year)
## Rows: 23
## Columns: 4
## $ Year    <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 20…
## $ mean    <dbl> 2968348, 2550567, 2215485, 2116357, 2111045, 2111633, 2045790,…
## $ std_dev <dbl> 4152778, 3858703, 3722301, 3682612, 3691496, 3720163, 3682011,…
## $ n       <int> 826, 939, 1094, 1165, 1174, 1203, 1219, 1240, 1283, 1244, 1265…

STEP-2K Subset the data for only 2022 (that is have only for Year 2022). Subset for 2021 as well.

This can be done with filter command. Then rename co2 column for 2022 to co2_2022 and co2 for 2021 to co2_2021.

data_2022 <- summary_co2_by_year %>%
  filter(Year == 2022) %>%             # Year가 2022인 데이터만 필터링
  rename(co2_2022 = mean)              # CO2의 평균을 co2_2022로 변경

data_2021 <- summary_co2_by_year %>%
  filter(Year == 2021) %>%             # Year가 2021인 데이터만 필터링
  rename(co2_2021 = mean) 
#checking for data 결과 확인
glimpse(data_2022)
## Rows: 1
## Columns: 4
## $ Year     <int> 2022
## $ co2_2022 <dbl> 1440012
## $ std_dev  <dbl> 2357372
## $ n        <int> 1172
glimpse(data_2021)
## Rows: 1
## Columns: 4
## $ Year     <int> 2021
## $ co2_2021 <dbl> 1454349
## $ std_dev  <dbl> 2447018
## $ n        <int> 1169

Left join these two data sets (data for 2021 and 2022)

combined_data <- left_join(data_2021, data_2022, by = "Year")
#checking for date 결과 확인
print(combined_data)
## # A tibble: 1 × 7
##    Year co2_2021 std_dev.x   n.x co2_2022 std_dev.y   n.y
##   <int>    <dbl>     <dbl> <int>    <dbl>     <dbl> <int>
## 1  2021 1454349.  2447018.  1169       NA        NA    NA