# Point the session at the course folder so that the relative CSV paths used
# later in the script resolve.
# NOTE(review): this is an absolute Windows path; it will fail on any machine
# where the folder does not exist.
setwd("D:\\AUCA\\R for data science")
# Echo the working directory to confirm the change took effect.
getwd()
## [1] "D:/AUCA/R for data science"
# List the folder contents to check that the input CSV files are present.
list.files()
##  [1] "20251MBI024_in"                                        
##  [2] "230201_MS_BIG_DATA_PROJECT_2_PN_HD.pdf"                
##  [3] "archive"                                               
##  [4] "archive.zip"                                           
##  [5] "class 2.Rmd"                                           
##  [6] "Class 2_Cont.Rmd"                                      
##  [7] "Class 3 and 4.Rmd"                                     
##  [8] "Class1.Rmd"                                            
##  [9] "Class1_2026_materials.pdf"                             
## [10] "class2.Rmd"                                            
## [11] "Class4.html"                                           
## [12] "CO2_emission.csv"                                      
## [13] "exercise.aux"                                          
## [14] "exercise.log"                                          
## [15] "exercise.pdf"                                          
## [16] "exercise.synctex.gz"                                   
## [17] "exercise.tex"                                          
## [18] "exercise.toc"                                          
## [19] "practice.Rmd"                                          
## [20] "R_PROGRAMMING_FOR_DATA_SCIENCE_2014_-_Roger_D_Peng.pdf"
## [21] "world_population.csv"
# Import both source tables (paths are relative to the working directory).
co2 <- utils::read.csv(file = "CO2_emission.csv")
population <- utils::read.csv(file = "world_population.csv")

#Exploratory data analysis

# Show the column names of the population table.
names(population)
##  [1] "Rank"                        "CCA3"                       
##  [3] "Country.Territory"           "Capital"                    
##  [5] "Continent"                   "X2022.Population"           
##  [7] "X2020.Population"            "X2015.Population"           
##  [9] "X2010.Population"            "X2000.Population"           
## [11] "X1990.Population"            "X1980.Population"           
## [13] "X1970.Population"            "Area..km.."                 
## [15] "Density..per.km.."           "Growth.Rate"                
## [17] "World.Population.Percentage"
# Preview the first five records.
head(population, n = 5)
##   Rank CCA3 Country.Territory          Capital Continent X2022.Population
## 1   36  AFG       Afghanistan            Kabul      Asia         41128771
## 2  138  ALB           Albania           Tirana    Europe          2842321
## 3   34  DZA           Algeria          Algiers    Africa         44903225
## 4  213  ASM    American Samoa        Pago Pago   Oceania            44273
## 5  203  AND           Andorra Andorra la Vella    Europe            79824
##   X2020.Population X2015.Population X2010.Population X2000.Population
## 1         38972230         33753499         28189672         19542982
## 2          2866849          2882481          2913399          3182021
## 3         43451666         39543154         35856344         30774621
## 4            46189            51368            54849            58230
## 5            77700            71746            71519            66097
##   X1990.Population X1980.Population X1970.Population Area..km..
## 1         10694796         12486631         10752971     652230
## 2          3295066          2941651          2324731      28748
## 3         25518074         18739378         13795915    2381741
## 4            47818            32886            27075        199
## 5            53569            35611            19860        468
##   Density..per.km.. Growth.Rate World.Population.Percentage
## 1           63.0587      1.0257                        0.52
## 2           98.8702      0.9957                        0.04
## 3           18.8531      1.0164                        0.56
## 4          222.4774      0.9831                        0.00
## 5          170.5641      1.0100                        0.00
# Preview the last ten records.
tail(population, n = 10)
##     Rank CCA3 Country.Territory      Capital     Continent X2022.Population
## 225   43  UZB        Uzbekistan     Tashkent          Asia         34627652
## 226  181  VUT           Vanuatu    Port-Vila       Oceania           326740
## 227  234  VAT      Vatican City Vatican City        Europe              510
## 228   51  VEN         Venezuela      Caracas South America         28301696
## 229   16  VNM           Vietnam        Hanoi          Asia         98186856
## 230  226  WLF Wallis and Futuna     Mata-Utu       Oceania            11572
## 231  172  ESH    Western Sahara     El Aaiún        Africa           575986
## 232   46  YEM             Yemen        Sanaa          Asia         33696614
## 233   63  ZMB            Zambia       Lusaka        Africa         20017675
## 234   74  ZWE          Zimbabwe       Harare        Africa         16320537
##     X2020.Population X2015.Population X2010.Population X2000.Population
## 225         33526656         30949417         28614227         24925554
## 226           311685           276438           245453           192074
## 227              520              564              596              651
## 228         28490453         30529716         28715022         24427729
## 229         96648685         92191398         87411012         79001142
## 230            11655            12182            13142            14723
## 231           556048           491824           413296           270375
## 232         32284046         28516545         24743946         18628700
## 233         18927715         16248230         13792086          9891136
## 234         15669666         14154937         12839771         11834676
##     X1990.Population X1980.Population X1970.Population Area..km..
## 225         20579100         15947129         12011361     447400
## 226           150882           118156            87019      12189
## 227              700              733              752          1
## 228         19750579         15210443         11355475     916445
## 229         66912613         52968270         41928849     331212
## 230            13454            11315             9377        142
## 231           178529           116775            76371     266000
## 232         13375121          9204938          6843607     527968
## 233          7686401          5720438          4281671     752612
## 234         10113893          7049926          5202918     390757
##     Density..per.km.. Growth.Rate World.Population.Percentage
## 225           77.3975      1.0160                        0.43
## 226           26.8061      1.0238                        0.00
## 227          510.0000      0.9980                        0.00
## 228           30.8820      1.0036                        0.35
## 229          296.4472      1.0074                        1.23
## 230           81.4930      0.9953                        0.00
## 231            2.1654      1.0184                        0.01
## 232           63.8232      1.0217                        0.42
## 233           26.5976      1.0280                        0.25
## 234           41.7665      1.0204                        0.20
# Compact overview of every column: storage type plus the first few values.
str(object = population)
## 'data.frame':    234 obs. of  17 variables:
##  $ Rank                       : int  36 138 34 213 203 42 224 201 33 140 ...
##  $ CCA3                       : chr  "AFG" "ALB" "DZA" "ASM" ...
##  $ Country.Territory          : chr  "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##  $ Capital                    : chr  "Kabul" "Tirana" "Algiers" "Pago Pago" ...
##  $ Continent                  : chr  "Asia" "Europe" "Africa" "Oceania" ...
##  $ X2022.Population           : int  41128771 2842321 44903225 44273 79824 35588987 15857 93763 45510318 2780469 ...
##  $ X2020.Population           : int  38972230 2866849 43451666 46189 77700 33428485 15585 92664 45036032 2805608 ...
##  $ X2015.Population           : int  33753499 2882481 39543154 51368 71746 28127721 14525 89941 43257065 2878595 ...
##  $ X2010.Population           : int  28189672 2913399 35856344 54849 71519 23364185 13172 85695 41100123 2946293 ...
##  $ X2000.Population           : int  19542982 3182021 30774621 58230 66097 16394062 11047 75055 37070774 3168523 ...
##  $ X1990.Population           : int  10694796 3295066 25518074 47818 53569 11828638 8316 63328 32637657 3556539 ...
##  $ X1980.Population           : int  12486631 2941651 18739378 32886 35611 8330047 6560 64888 28024803 3135123 ...
##  $ X1970.Population           : int  10752971 2324731 13795915 27075 19860 6029700 6283 64516 23842803 2534377 ...
##  $ Area..km..                 : int  652230 28748 2381741 199 468 1246700 91 442 2780400 29743 ...
##  $ Density..per.km..          : num  63.1 98.9 18.9 222.5 170.6 ...
##  $ Growth.Rate                : num  1.026 0.996 1.016 0.983 1.01 ...
##  $ World.Population.Percentage: num  0.52 0.04 0.56 0 0 0.45 0 0 0.57 0.03 ...
# Table shape: number of rows followed by number of columns.
c(nrow(population), ncol(population))
## [1] 234  17
# Count rows that repeat an earlier row exactly.
length(which(duplicated(population)))
## [1] 0
# Keep only the first occurrence of each row.
population <- unique(population)
# Tally the missing entries column by column.
vapply(population, function(column) sum(is.na(column)), numeric(1))
##                        Rank                        CCA3 
##                           0                           0 
##           Country.Territory                     Capital 
##                           0                           0 
##                   Continent            X2022.Population 
##                           0                           0 
##            X2020.Population            X2015.Population 
##                           0                           0 
##            X2010.Population            X2000.Population 
##                           0                           0 
##            X1990.Population            X1980.Population 
##                           0                           0 
##            X1970.Population                  Area..km.. 
##                           0                           0 
##           Density..per.km..                 Growth.Rate 
##                           0                           0 
## World.Population.Percentage 
##                           0
#Boxplot for outliers
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Collect the numeric columns and store them as doubles. The population
# columns are read as integers, and the five-number summary behind boxplot()
# adds pairs of values (x[floor(d)] + x[ceiling(d)]); for the largest
# countries that sum exceeds the 32-bit integer range and turns into NA
# ("NAs produced by integer overflow"). Doubles avoid the overflow.
numeric_cols <- population %>%
  select(where(is.numeric)) %>%
  mutate(across(everything(), as.numeric))

# One box per numeric variable; las = 2 rotates the axis labels.
boxplot(numeric_cols, main = "Boxplot of Numeric Variables", las = 2)

# Re-check the per-column count of missing values.
population %>% is.na() %>% colSums()
##                        Rank                        CCA3 
##                           0                           0 
##           Country.Territory                     Capital 
##                           0                           0 
##                   Continent            X2022.Population 
##                           0                           0 
##            X2020.Population            X2015.Population 
##                           0                           0 
##            X2010.Population            X2000.Population 
##                           0                           0 
##            X1990.Population            X1980.Population 
##                           0                           0 
##            X1970.Population                  Area..km.. 
##                           0                           0 
##           Density..per.km..                 Growth.Rate 
##                           0                           0 
## World.Population.Percentage 
##                           0
# Re-check that no fully duplicated rows remain.
length(which(duplicated(population)))
## [1] 0

#Generating newVariable by using World Population Dataset

# Distribution of the growth-rate column (min, quartiles, mean, max).
summary(population[["Growth.Rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.912   1.002   1.008   1.010   1.017   1.069
# Projection horizon in years: from the 2022 baseline to the 2030 target.
t <- 2030 - 2022
# Compound the 2022 population over `t` periods. The horizon is taken from `t`
# rather than a repeated literal so the two cannot drift apart.
# NOTE(review): assumes Growth.Rate is a per-year growth multiplier - confirm
# against the dataset documentation.
population$Population_2030 <- population$X2022.Population *
  population$Growth.Rate^t
# Spot-check the inputs next to the projected value.
head(population[, c("Country.Territory", "X2022.Population", "Growth.Rate",
                    "Population_2030")])
##   Country.Territory X2022.Population Growth.Rate Population_2030
## 1       Afghanistan         41128771      1.0257     50385848.48
## 2           Albania          2842321      0.9957      2746004.10
## 3           Algeria         44903225      1.0164     51144010.98
## 4    American Samoa            44273      0.9831        38629.63
## 5           Andorra            79824      1.0100        86437.95
## 6            Angola         35588987      1.0315     45610988.54
# Distribution of the growth-rate column before the log-based projection.
summary(population[["Growth.Rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.912   1.002   1.008   1.010   1.017   1.069
# Same projection written in exponential form: P_2022 * exp(r * t), where
# r = log(Growth.Rate) and t is the number of periods.
t <- 8
population[["r"]] <- log(population[["Growth.Rate"]])
population[["Population_2030"]] <- with(
  population,
  X2022.Population * exp(r * t)
)
# Spot-check the inputs next to the projected value.
head(population[c("Country.Territory", "X2022.Population", "Growth.Rate", "Population_2030")])
##   Country.Territory X2022.Population Growth.Rate Population_2030
## 1       Afghanistan         41128771      1.0257     50385848.48
## 2           Albania          2842321      0.9957      2746004.10
## 3           Algeria         44903225      1.0164     51144010.98
## 4    American Samoa            44273      0.9831        38629.63
## 5           Andorra            79824      1.0100        86437.95
## 6            Angola         35588987      1.0315     45610988.54

#Value extraction and plot

# Sort by 2022 population, largest first, and keep the leading ten rows.
top10 <- slice_head(arrange(population, desc(X2022.Population)), n = 10)
# Names of the ten selected countries.
pull(top10, Country.Territory)
##  [1] "China"         "India"         "United States" "Indonesia"    
##  [5] "Pakistan"      "Nigeria"       "Brazil"        "Bangladesh"   
##  [9] "Russia"        "Mexico"
# Horizontal bar chart of the ten selected countries, with bars ordered by
# their 2022 population.
ggplot(
  data = top10,
  mapping = aes(
    x = reorder(Country.Territory, X2022.Population),
    y = X2022.Population
  )
) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 10 Most Populous Countries (2022)") +
  xlab("Country") +
  ylab("Population") +
  theme_minimal()

# Keep the country name plus the population snapshots from 1990 onwards.
top10_trend <- top10 %>%
  select(Country.Territory,
         X1990.Population,
         X2000.Population,
         X2010.Population,
         X2015.Population,
         X2020.Population,
         X2022.Population)
# Reshape to one row per country and year.
top10_long <- top10_trend %>%
  pivot_longer(cols = -Country.Territory,
               names_to = "Year",
               values_to = "Population")
# Extract the four-digit year from names such as "X1990.Population". The
# pattern is anchored and the dot is escaped, so only the exact
# "X<year>.Population" shape is rewritten (an unescaped "." matches any
# character and an unanchored "X" matches anywhere in the name).
top10_long$Year <- as.numeric(
  sub("^X([0-9]{4})\\.Population$", "\\1", top10_long$Year)
)
# Population over time, one coloured line (with markers) per country.
ggplot(
  data = top10_long,
  mapping = aes(x = Year, y = Population, color = Country.Territory)
) +
  geom_line() +
  geom_point() +
  ggtitle("Population Trend of Top 10 Countries (1990–2022)") +
  xlab("Year") +
  ylab("Population") +
  theme_minimal()

# Same trend chart, with the y axis labelled using thousands separators.
ggplot(
  data = top10_long,
  mapping = aes(x = Year, y = Population, color = Country.Territory)
) +
  geom_line() +
  geom_point() +
  ggtitle("Population Trend of Top 10 Countries (1990–2022)") +
  xlab("Year") +
  ylab("Population") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()

# Same trend chart, with the population divided by 1e6 on the y axis.
ggplot(
  data = top10_long,
  mapping = aes(x = Year, y = Population / 1e6, color = Country.Territory)
) +
  geom_line() +
  geom_point() +
  ggtitle("Population Trend of Top 10 Countries (1990–2022)") +
  xlab("Year") +
  ylab("Population (Millions)") +
  theme_minimal()

# Show the column names of the CO2 table.
names(co2)
##  [1] "Country.Name"   "country_code"   "Region"         "Indicator.Name"
##  [5] "X1990"          "X1991"          "X1992"          "X1993"         
##  [9] "X1994"          "X1995"          "X1996"          "X1997"         
## [13] "X1998"          "X1999"          "X2000"          "X2001"         
## [17] "X2002"          "X2003"          "X2004"          "X2005"         
## [21] "X2006"          "X2007"          "X2008"          "X2009"         
## [25] "X2010"          "X2011"          "X2012"          "X2013"         
## [29] "X2014"          "X2015"          "X2016"          "X2017"         
## [33] "X2018"          "X2019"          "X2019.1"
# Flag which of the ten country names also occur in the CO2 table.
is.element(top10$Country.Territory, co2$Country.Name)
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
# Names with no counterpart in the CO2 table.
top10$Country.Territory[is.na(match(top10$Country.Territory, co2$Country.Name))]
## [1] "Russia"
# The CO2 table uses "Russian Federation"; rename it to the spelling used in
# the population table.
co2$Country.Name <- replace(
  co2$Country.Name,
  co2$Country.Name == "Russian Federation",
  "Russia"
)
# Re-run the membership check after the rename.
is.element(top10$Country.Territory, co2$Country.Name)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Restrict the CO2 table to the ten most populous countries.
co2_top10 <- co2 %>%
  filter(Country.Name %in% top10$Country.Territory)
# Keep the country name and the plain yearly columns (X1990 ... X2019).
# The table also carries an "X2019.1" column; starts_with("X") would include
# it, and it would then be parsed as the year 2019.1 and plotted as an extra
# point. Matching exactly "X" + four digits leaves it out.
# NOTE(review): X2019.1 looks like a repeat of X2019 - confirm in the CSV.
co2_trend <- co2_top10 %>%
  select(Country.Name, matches("^X[0-9]{4}$"))

# Reshape to one row per country and year.
co2_long <- co2_trend %>%
  pivot_longer(cols = -Country.Name,
               names_to = "Year",
               values_to = "CO2")

# Strip the leading "X" that read.csv() adds to numeric headers.
co2_long$Year <- as.numeric(sub("^X", "", co2_long$Year))
# CO2 values over time, one coloured line per country, with comma-formatted
# y-axis labels.
ggplot(
  data = co2_long,
  mapping = aes(x = Year, y = CO2, color = Country.Name)
) +
  geom_line() +
  ggtitle("CO2 Emissions Trend (1990–2019)") +
  xlab("Year") +
  ylab("CO2 Emissions") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()

# Highest recorded CO2 value per country (missing values ignored), listed from
# largest to smallest.
arrange(
  summarise(
    group_by(co2_long, Country.Name),
    max_CO2 = max(CO2, na.rm = TRUE)
  ),
  desc(max_CO2)
)
## # A tibble: 10 × 2
##    Country.Name  max_CO2
##    <chr>           <dbl>
##  1 United States  20.5  
##  2 Russia         14.6  
##  3 China           7.61 
##  4 Mexico          4.19 
##  5 Brazil          2.52 
##  6 Indonesia       2.29 
##  7 India           1.81 
##  8 Pakistan        0.956
##  9 Nigeria         0.917
## 10 Bangladesh      0.557
# Structure of the long-format CO2 table.
str(object = co2_long)
## tibble [310 × 3] (S3: tbl_df/tbl/data.frame)
##  $ Country.Name: chr [1:310] "Bangladesh" "Bangladesh" "Bangladesh" "Bangladesh" ...
##  $ Year        : num [1:310] 1990 1991 1992 1993 1994 ...
##  $ CO2         : num [1:310] 0.112 0.103 0.109 0.114 0.12 ...
# Distinct country names present in the CO2 table.
unique(co2[["Country.Name"]])
##   [1] "Aruba"                          "Afghanistan"                   
##   [3] "Angola"                         "Albania"                       
##   [5] "Andorra"                        "United Arab Emirates"          
##   [7] "Argentina"                      "Armenia"                       
##   [9] "American Samoa"                 "Antigua and Barbuda"           
##  [11] "Australia"                      "Austria"                       
##  [13] "Azerbaijan"                     "Burundi"                       
##  [15] "Belgium"                        "Benin"                         
##  [17] "Burkina Faso"                   "Bangladesh"                    
##  [19] "Bulgaria"                       "Bahrain"                       
##  [21] "Bahamas, The"                   "Bosnia and Herzegovina"        
##  [23] "Belarus"                        "Belize"                        
##  [25] "Bermuda"                        "Bolivia"                       
##  [27] "Brazil"                         "Barbados"                      
##  [29] "Brunei Darussalam"              "Bhutan"                        
##  [31] "Botswana"                       "Central African Republic"      
##  [33] "Canada"                         "Switzerland"                   
##  [35] "Chile"                          "China"                         
##  [37] "Cote d'Ivoire"                  "Cameroon"                      
##  [39] "Congo, Dem. Rep."               "Congo, Rep."                   
##  [41] "Colombia"                       "Comoros"                       
##  [43] "Cabo Verde"                     "Costa Rica"                    
##  [45] "Cuba"                           "Curacao"                       
##  [47] "Cayman Islands"                 "Cyprus"                        
##  [49] "Czech Republic"                 "Germany"                       
##  [51] "Djibouti"                       "Dominica"                      
##  [53] "Denmark"                        "Dominican Republic"            
##  [55] "Algeria"                        "Ecuador"                       
##  [57] "Egypt, Arab Rep."               "Eritrea"                       
##  [59] "Spain"                          "Estonia"                       
##  [61] "Ethiopia"                       "Finland"                       
##  [63] "Fiji"                           "France"                        
##  [65] "Faroe Islands"                  "Micronesia, Fed. Sts."         
##  [67] "Gabon"                          "United Kingdom"                
##  [69] "Georgia"                        "Ghana"                         
##  [71] "Gibraltar"                      "Guinea"                        
##  [73] "Gambia, The"                    "Guinea-Bissau"                 
##  [75] "Equatorial Guinea"              "Greece"                        
##  [77] "Grenada"                        "Greenland"                     
##  [79] "Guatemala"                      "Guam"                          
##  [81] "Guyana"                         "Hong Kong SAR, China"          
##  [83] "Honduras"                       "Croatia"                       
##  [85] "Haiti"                          "Hungary"                       
##  [87] "Indonesia"                      "Isle of Man"                   
##  [89] "India"                          "Ireland"                       
##  [91] "Iran, Islamic Rep."             "Iraq"                          
##  [93] "Iceland"                        "Israel"                        
##  [95] "Italy"                          "Jamaica"                       
##  [97] "Jordan"                         "Japan"                         
##  [99] "Kazakhstan"                     "Kenya"                         
## [101] "Kyrgyz Republic"                "Cambodia"                      
## [103] "Kiribati"                       "St. Kitts and Nevis"           
## [105] "Korea, Rep."                    "Kuwait"                        
## [107] "Lao PDR"                        "Lebanon"                       
## [109] "Liberia"                        "Libya"                         
## [111] "St. Lucia"                      "Liechtenstein"                 
## [113] "Sri Lanka"                      "Lesotho"                       
## [115] "Lithuania"                      "Luxembourg"                    
## [117] "Latvia"                         "Macao SAR, China"              
## [119] "St. Martin (French part)"       "Morocco"                       
## [121] "Monaco"                         "Moldova"                       
## [123] "Madagascar"                     "Maldives"                      
## [125] "Mexico"                         "Marshall Islands"              
## [127] "North Macedonia"                "Mali"                          
## [129] "Malta"                          "Myanmar"                       
## [131] "Montenegro"                     "Mongolia"                      
## [133] "Northern Mariana Islands"       "Mozambique"                    
## [135] "Mauritania"                     "Mauritius"                     
## [137] "Malawi"                         "Malaysia"                      
## [139] "Namibia"                        "New Caledonia"                 
## [141] "Niger"                          "Nigeria"                       
## [143] "Nicaragua"                      "Netherlands"                   
## [145] "Norway"                         "Nepal"                         
## [147] "Nauru"                          "New Zealand"                   
## [149] "Oman"                           "Pakistan"                      
## [151] "Panama"                         "Peru"                          
## [153] "Philippines"                    "Palau"                         
## [155] "Papua New Guinea"               "Poland"                        
## [157] "Puerto Rico"                    "Korea, Dem. People's Rep."     
## [159] "Portugal"                       "Paraguay"                      
## [161] "West Bank and Gaza"             "French Polynesia"              
## [163] "Qatar"                          "Romania"                       
## [165] "Russia"                         "Rwanda"                        
## [167] "Saudi Arabia"                   "Sudan"                         
## [169] "Senegal"                        "Singapore"                     
## [171] "Solomon Islands"                "Sierra Leone"                  
## [173] "El Salvador"                    "San Marino"                    
## [175] "Somalia"                        "Serbia"                        
## [177] "South Sudan"                    "Sao Tome and Principe"         
## [179] "Suriname"                       "Slovak Republic"               
## [181] "Slovenia"                       "Sweden"                        
## [183] "Eswatini"                       "Sint Maarten (Dutch part)"     
## [185] "Seychelles"                     "Syrian Arab Republic"          
## [187] "Turks and Caicos Islands"       "Chad"                          
## [189] "Togo"                           "Thailand"                      
## [191] "Tajikistan"                     "Turkmenistan"                  
## [193] "Timor-Leste"                    "Tonga"                         
## [195] "Trinidad and Tobago"            "Tunisia"                       
## [197] "Turkiye"                        "Tuvalu"                        
## [199] "Tanzania"                       "Uganda"                        
## [201] "Ukraine"                        "Uruguay"                       
## [203] "United States"                  "Uzbekistan"                    
## [205] "St. Vincent and the Grenadines" "Venezuela, RB"                 
## [207] "British Virgin Islands"         "Virgin Islands (U.S.)"         
## [209] "Vietnam"                        "Vanuatu"                       
## [211] "Samoa"                          "Yemen, Rep."                   
## [213] "South Africa"                   "Zambia"                        
## [215] "Zimbabwe"
# Names of the ten selected countries.
top10[["Country.Territory"]]
##  [1] "China"         "India"         "United States" "Indonesia"    
##  [5] "Pakistan"      "Nigeria"       "Brazil"        "Bangladesh"   
##  [9] "Russia"        "Mexico"
# Show the column names of the CO2 table.
names(co2)
##  [1] "Country.Name"   "country_code"   "Region"         "Indicator.Name"
##  [5] "X1990"          "X1991"          "X1992"          "X1993"         
##  [9] "X1994"          "X1995"          "X1996"          "X1997"         
## [13] "X1998"          "X1999"          "X2000"          "X2001"         
## [17] "X2002"          "X2003"          "X2004"          "X2005"         
## [21] "X2006"          "X2007"          "X2008"          "X2009"         
## [25] "X2010"          "X2011"          "X2012"          "X2013"         
## [29] "X2014"          "X2015"          "X2016"          "X2017"         
## [33] "X2018"          "X2019"          "X2019.1"
# Structure of the CO2 table: storage type and first values of each column.
str(object = co2)
## 'data.frame':    215 obs. of  35 variables:
##  $ Country.Name  : chr  "Aruba" "Afghanistan" "Angola" "Albania" ...
##  $ country_code  : chr  "ABW" "AFG" "AGO" "ALB" ...
##  $ Region        : chr  "Latin America & Caribbean" "South Asia" "Sub-Saharan Africa" "Europe & Central Asia" ...
##  $ Indicator.Name: chr  "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" ...
##  $ X1990         : num  NA 0.192 0.554 1.82 7.522 ...
##  $ X1991         : num  NA 0.168 0.545 1.243 7.235 ...
##  $ X1992         : num  NA 0.096 0.544 0.684 6.963 ...
##  $ X1993         : num  NA 0.0847 0.709 0.6383 6.7242 ...
##  $ X1994         : num  NA 0.0755 0.8368 0.6454 6.5416 ...
##  $ X1995         : num  NA 0.0685 0.9121 0.6054 6.7335 ...
##  $ X1996         : num  NA 0.0626 1.0722 0.6124 6.9916 ...
##  $ X1997         : num  NA 0.0568 1.0866 0.4669 7.3074 ...
##  $ X1998         : num  NA 0.0527 1.0918 0.5722 7.6395 ...
##  $ X1999         : num  NA 0.0402 1.1099 0.9554 7.9232 ...
##  $ X2000         : num  NA 0.0366 0.9881 1.0262 7.9523 ...
##  $ X2001         : num  NA 0.0338 0.9418 1.0555 7.7215 ...
##  $ X2002         : num  NA 0.0456 0.8956 1.2324 7.5662 ...
##  $ X2003         : num  NA 0.0515 0.9249 1.339 7.2424 ...
##  $ X2004         : num  NA 0.0417 0.9303 1.4041 7.3443 ...
##  $ X2005         : num  NA 0.0604 0.8135 1.3382 7.3538 ...
##  $ X2006         : num  NA 0.0666 0.8218 1.34 6.7905 ...
##  $ X2007         : num  NA 0.0653 0.8118 1.3939 6.531 ...
##  $ X2008         : num  NA 0.128 0.889 1.384 6.439 ...
##  $ X2009         : num  NA 0.172 0.939 1.441 6.157 ...
##  $ X2010         : num  NA 0.244 0.976 1.528 6.157 ...
##  $ X2011         : num  NA 0.297 0.986 1.669 5.851 ...
##  $ X2012         : num  NA 0.259 0.951 1.503 5.945 ...
##  $ X2013         : num  NA 0.186 1.036 1.534 5.943 ...
##  $ X2014         : num  NA 0.146 1.1 1.668 5.807 ...
##  $ X2015         : num  NA 0.173 1.135 1.604 6.026 ...
##  $ X2016         : num  NA 0.15 1.03 1.56 6.08 ...
##  $ X2017         : num  NA 0.132 0.813 1.789 6.104 ...
##  $ X2018         : num  NA 0.163 0.778 1.783 6.363 ...
##  $ X2019         : num  NA 0.16 0.792 1.692 6.481 ...
##  $ X2019.1       : num  NA 0.16 0.792 1.692 6.481 ...
# Reload the CO2 table from disk. This replaces the in-memory `co2`, so the
# earlier "Russian Federation" -> "Russia" rename is no longer applied.
co2 <- read.csv("CO2_emission.csv")
# Shape of the freshly loaded table: rows, then columns.
dim(co2)
## [1] 215  35
# Names of the ten selected countries.
top10[["Country.Territory"]]
##  [1] "China"         "India"         "United States" "Indonesia"    
##  [5] "Pakistan"      "Nigeria"       "Brazil"        "Bangladesh"   
##  [9] "Russia"        "Mexico"
# Restrict the CO2 table to the ten most populous countries.
# `co2` was just re-read from disk, so it again spells the country
# "Russian Federation" while `top10` uses "Russia". Normalise the name inside
# the pipeline; otherwise the filter keeps only nine of the ten countries.
co2_top10 <- co2 %>%
  mutate(
    Country.Name = if_else(
      Country.Name == "Russian Federation",
      "Russia",
      Country.Name
    )
  ) %>%
  filter(Country.Name %in% top10$Country.Territory)
# Expect one row per selected country.
dim(co2_top10)
## [1] 10 35
# Keep the country name and the plain yearly columns (X1990 ... X2019).
# starts_with("X") would also select "X2019.1", which would then be parsed as
# the year 2019.1 and plotted as an extra point; matching exactly "X" + four
# digits leaves it out.
# NOTE(review): X2019.1 looks like a repeat of X2019 - confirm in the CSV.
co2_trend <- co2_top10 %>%
  select(Country.Name, matches("^X[0-9]{4}$"))
# Reshape to one row per country and year.
co2_long <- co2_trend %>%
  pivot_longer(cols = -Country.Name,
               names_to = "Year",
               values_to = "CO2")
# Strip the leading "X" that read.csv() adds to numeric headers. The CO2
# column is already numeric after the reshape, so it needs no conversion.
co2_long$Year <- as.numeric(sub("^X", "", co2_long$Year))
# Drop country-years without a CO2 value.
co2_long <- na.omit(co2_long)
# CO2 values over time: one coloured line with markers per country.
ggplot(
  data = co2_long,
  mapping = aes(x = Year, y = CO2, color = Country.Name)
) +
  geom_line() +
  geom_point() +
  theme_minimal()

# Country names sorted by their highest CO2 value, largest first.
country_order <- pull(
  arrange(
    summarise(
      group_by(co2_long, Country.Name),
      max_CO2 = max(CO2, na.rm = TRUE)
    ),
    desc(max_CO2)
  ),
  Country.Name
)
# Turn the country column into a factor whose levels follow that order.
co2_long$Country.Name <- factor(co2_long$Country.Name, levels = country_order)
# Redraw the CO2 chart now that the country column is an ordered factor.
ggplot(
  data = co2_long,
  mapping = aes(x = Year, y = CO2, color = Country.Name)
) +
  geom_line() +
  geom_point() +
  theme_minimal()

# Correlation Analysis ----

# Pick the four numeric country-level measures to correlate.
# (The original built this identical selection twice; once is enough.)
corr_data <- population %>%
  select(Area..km..,
         Density..per.km..,
         Growth.Rate,
         World.Population.Percentage)
# Pairwise correlation matrix of the selected columns (cor() default method).
cor_matrix <- cor(corr_data)

cor_matrix
##                              Area..km.. Density..per.km.. Growth.Rate
## Area..km..                   1.00000000       -0.06312785 -0.01397017
## Density..per.km..           -0.06312785        1.00000000 -0.06975328
## Growth.Rate                 -0.01397017       -0.06975328  1.00000000
## World.Population.Percentage  0.45328363       -0.02764600 -0.02092954
##                             World.Population.Percentage
## Area..km..                                   0.45328363
## Density..per.km..                           -0.02764600
## Growth.Rate                                 -0.02092954
## World.Population.Percentage                  1.00000000
library(ggplot2)

# Flatten the correlation matrix into one row per (Var1, Var2) pair, with the
# coefficient stored in the Freq column.
corr_long <- cor_matrix %>%
  as.table() %>%
  as.data.frame()

# Heatmap: one tile per variable pair, labelled with the rounded coefficient
# and coloured on a blue-white-red scale centred at zero.
ggplot(data = corr_long, mapping = aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile() +
  geom_text(mapping = aes(label = round(Freq, digits = 2))) +
  scale_fill_gradient2(midpoint = 0, low = "blue", mid = "white", high = "red") +
  theme_minimal() +
  ggtitle("Correlation Heatmap")

# Country name together with its 2022 population.
pop_2022 <- population %>%
  select(Country.Territory, X2022.Population)
# Year columns of the CO2 table, selected by name (headers starting with "X")
# rather than by position; drop = FALSE keeps a data frame even if only one
# column matches.
co2_years <- co2[, grep("^X", colnames(co2)), drop = FALSE]
# Coerce every year column to numeric; the result is a named list of vectors.
co2_years_num <- lapply(co2_years, as.numeric)
# Per-year summary statistics, computed on the coerced columns above instead
# of the hard-coded positions 5:35, so the result follows the column names if
# the file layout changes.
lapply(co2_years_num, summary)
## $X1990
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##  0.001183  0.434249  1.914543  4.404504  6.910335 30.195189        30 
## 
## $X1991
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##  0.001158  0.452759  1.941825  4.290648  6.497471 31.778496        29 
## 
## $X1992
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.00113  0.47138  2.01221  4.17483  6.48231 29.63244       26 
## 
## $X1993
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0011  0.4485  1.8676  4.0855  6.4309 33.1220      26 
## 
## $X1994
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##  0.001071  0.406659  1.830972  4.045067  6.337175 36.466263        26 
## 
## $X1995
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##  0.001043  0.423225  1.832534  4.136628  6.360589 37.102174        25 
## 
## $X1996
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.4619  1.8750  4.2111  6.5817 40.0742      25 
## 
## $X1997
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##  0.000991  0.475029  2.034502  4.250716  6.478028 47.429575        25 
## 
## $X1998
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03557  0.49675  2.15638  4.28281  6.36991 48.04502       26 
## 
## $X1999
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0367  0.5090  2.1323  4.2657  6.2332 50.8338      26 
## 
## $X2000
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0346  0.5143  2.1680  4.2444  6.0749 48.3740      25 
## 
## $X2001
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03065  0.55498  2.20486  4.32311  6.22776 46.58438       25 
## 
## $X2002
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03132  0.57494  2.29148  4.34969  6.39291 50.71216       24 
## 
## $X2003
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02316  0.57187  2.17408  4.46685  6.81868 50.95403       24 
## 
## $X2004
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02103  0.58591  2.26863  4.50548  6.57848 49.20805       24 
## 
## $X2005
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02037  0.63786  2.33715  4.52585  6.51468 44.53381       24 
## 
## $X2006
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02366  0.62507  2.40391  4.59021  6.55718 42.79831       24 
## 
## $X2007
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02289  0.62077  2.49795  4.55871  6.69946 40.89652       24 
## 
## $X2008
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02215  0.58926  2.42452  4.53254  6.57775 36.91871       24 
## 
## $X2009
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02143  0.60261  2.57505  4.63638  6.88817 32.67180       24 
## 
## $X2010
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03458  0.61466  2.52286  4.39945  6.15996 32.63430       24 
## 
## $X2011
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.04019  0.63099  2.49273  4.39217  6.26989 33.49441       24 
## 
## $X2012
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.04002  0.63350  2.56200  4.44040  6.33634 34.18822       24 
## 
## $X2013
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02698  0.67563  2.54269  4.32668  6.06988 32.59894       24 
## 
## $X2014
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.02912  0.66928  2.62039  4.22697  5.81403 33.20589       24 
## 
## $X2015
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03937  0.66734  2.57930  4.18413  5.77668 33.04351       24 
## 
## $X2016
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03071  0.72849  2.59779  4.19543  5.80213 32.74589       24 
## 
## $X2017
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03501  0.79195  2.60998  4.19980  5.83406 32.12799       24 
## 
## $X2018
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03711  0.79561  2.58448  4.16497  5.77019 31.06753       24 
## 
## $X2019
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03699  0.80176  2.71762  4.11514  5.58516 32.47447       24 
## 
## $X2019.1
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  0.03699  0.80176  2.71762  4.11514  5.58516 32.47447       24
# Number of missing values in each year column. Columns are picked by name
# (headers starting with "X") rather than by the fixed positions 5:35, so the
# count stays correct if columns are added or reordered.
lapply(co2[grep("^X", colnames(co2))], function(x) sum(is.na(x)))
## $X1990
## [1] 30
## 
## $X1991
## [1] 29
## 
## $X1992
## [1] 26
## 
## $X1993
## [1] 26
## 
## $X1994
## [1] 26
## 
## $X1995
## [1] 25
## 
## $X1996
## [1] 25
## 
## $X1997
## [1] 25
## 
## $X1998
## [1] 26
## 
## $X1999
## [1] 26
## 
## $X2000
## [1] 25
## 
## $X2001
## [1] 25
## 
## $X2002
## [1] 24
## 
## $X2003
## [1] 24
## 
## $X2004
## [1] 24
## 
## $X2005
## [1] 24
## 
## $X2006
## [1] 24
## 
## $X2007
## [1] 24
## 
## $X2008
## [1] 24
## 
## $X2009
## [1] 24
## 
## $X2010
## [1] 24
## 
## $X2011
## [1] 24
## 
## $X2012
## [1] 24
## 
## $X2013
## [1] 24
## 
## $X2014
## [1] 24
## 
## $X2015
## [1] 24
## 
## $X2016
## [1] 24
## 
## $X2017
## [1] 24
## 
## $X2018
## [1] 24
## 
## $X2019
## [1] 24
## 
## $X2019.1
## [1] 24