# Point the session at the folder that holds the CSV inputs read below.
# NOTE(review): this absolute Windows path is machine-specific; setwd() stops
# with an error on any computer where the folder does not exist.
setwd("D:\\AUCA\\R for data science")
# Print the working directory now in effect.
getwd()
## [1] "D:/AUCA/R for data science"
# List the files in the working directory so the CSV file names can be checked.
list.files()
## [1] "20251MBI024_in"
## [2] "230201_MS_BIG_DATA_PROJECT_2_PN_HD.pdf"
## [3] "archive"
## [4] "archive.zip"
## [5] "class 2.Rmd"
## [6] "Class 2_Cont.Rmd"
## [7] "Class 3 and 4.Rmd"
## [8] "Class1.Rmd"
## [9] "Class1_2026_materials.pdf"
## [10] "class2.Rmd"
## [11] "Class4.html"
## [12] "CO2_emission.csv"
## [13] "exercise.aux"
## [14] "exercise.log"
## [15] "exercise.pdf"
## [16] "exercise.synctex.gz"
## [17] "exercise.tex"
## [18] "exercise.toc"
## [19] "practice.Rmd"
## [20] "R_PROGRAMMING_FOR_DATA_SCIENCE_2014_-_Roger_D_Peng.pdf"
## [21] "world_population.csv"
# Read the two source tables from the working directory.
population <- read.csv(file = "world_population.csv")
co2 <- read.csv(file = "CO2_emission.csv")
# Show the variable (column) names of the population table.
names(population)
## [1] "Rank" "CCA3"
## [3] "Country.Territory" "Capital"
## [5] "Continent" "X2022.Population"
## [7] "X2020.Population" "X2015.Population"
## [9] "X2010.Population" "X2000.Population"
## [11] "X1990.Population" "X1980.Population"
## [13] "X1970.Population" "Area..km.."
## [15] "Density..per.km.." "Growth.Rate"
## [17] "World.Population.Percentage"
# First five rows of the population table.
head(population, n = 5L)
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 38972230 33753499 28189672 19542982
## 2 2866849 2882481 2913399 3182021
## 3 43451666 39543154 35856344 30774621
## 4 46189 51368 54849 58230
## 5 77700 71746 71519 66097
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 10694796 12486631 10752971 652230
## 2 3295066 2941651 2324731 28748
## 3 25518074 18739378 13795915 2381741
## 4 47818 32886 27075 199
## 5 53569 35611 19860 468
## Density..per.km.. Growth.Rate World.Population.Percentage
## 1 63.0587 1.0257 0.52
## 2 98.8702 0.9957 0.04
## 3 18.8531 1.0164 0.56
## 4 222.4774 0.9831 0.00
## 5 170.5641 1.0100 0.00
# Last ten rows of the population table.
tail(population, n = 10L)
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 225 43 UZB Uzbekistan Tashkent Asia 34627652
## 226 181 VUT Vanuatu Port-Vila Oceania 326740
## 227 234 VAT Vatican City Vatican City Europe 510
## 228 51 VEN Venezuela Caracas South America 28301696
## 229 16 VNM Vietnam Hanoi Asia 98186856
## 230 226 WLF Wallis and Futuna Mata-Utu Oceania 11572
## 231 172 ESH Western Sahara El Aaiún Africa 575986
## 232 46 YEM Yemen Sanaa Asia 33696614
## 233 63 ZMB Zambia Lusaka Africa 20017675
## 234 74 ZWE Zimbabwe Harare Africa 16320537
## X2020.Population X2015.Population X2010.Population X2000.Population
## 225 33526656 30949417 28614227 24925554
## 226 311685 276438 245453 192074
## 227 520 564 596 651
## 228 28490453 30529716 28715022 24427729
## 229 96648685 92191398 87411012 79001142
## 230 11655 12182 13142 14723
## 231 556048 491824 413296 270375
## 232 32284046 28516545 24743946 18628700
## 233 18927715 16248230 13792086 9891136
## 234 15669666 14154937 12839771 11834676
## X1990.Population X1980.Population X1970.Population Area..km..
## 225 20579100 15947129 12011361 447400
## 226 150882 118156 87019 12189
## 227 700 733 752 1
## 228 19750579 15210443 11355475 916445
## 229 66912613 52968270 41928849 331212
## 230 13454 11315 9377 142
## 231 178529 116775 76371 266000
## 232 13375121 9204938 6843607 527968
## 233 7686401 5720438 4281671 752612
## 234 10113893 7049926 5202918 390757
## Density..per.km.. Growth.Rate World.Population.Percentage
## 225 77.3975 1.0160 0.43
## 226 26.8061 1.0238 0.00
## 227 510.0000 0.9980 0.00
## 228 30.8820 1.0036 0.35
## 229 296.4472 1.0074 1.23
## 230 81.4930 0.9953 0.00
## 231 2.1654 1.0184 0.01
## 232 63.8232 1.0217 0.42
## 233 26.5976 1.0280 0.25
## 234 41.7665 1.0204 0.20
# Show the storage type of every column together with a few sample values.
str(population)
## 'data.frame': 234 obs. of 17 variables:
## $ Rank : int 36 138 34 213 203 42 224 201 33 140 ...
## $ CCA3 : chr "AFG" "ALB" "DZA" "ASM" ...
## $ Country.Territory : chr "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## $ Capital : chr "Kabul" "Tirana" "Algiers" "Pago Pago" ...
## $ Continent : chr "Asia" "Europe" "Africa" "Oceania" ...
## $ X2022.Population : int 41128771 2842321 44903225 44273 79824 35588987 15857 93763 45510318 2780469 ...
## $ X2020.Population : int 38972230 2866849 43451666 46189 77700 33428485 15585 92664 45036032 2805608 ...
## $ X2015.Population : int 33753499 2882481 39543154 51368 71746 28127721 14525 89941 43257065 2878595 ...
## $ X2010.Population : int 28189672 2913399 35856344 54849 71519 23364185 13172 85695 41100123 2946293 ...
## $ X2000.Population : int 19542982 3182021 30774621 58230 66097 16394062 11047 75055 37070774 3168523 ...
## $ X1990.Population : int 10694796 3295066 25518074 47818 53569 11828638 8316 63328 32637657 3556539 ...
## $ X1980.Population : int 12486631 2941651 18739378 32886 35611 8330047 6560 64888 28024803 3135123 ...
## $ X1970.Population : int 10752971 2324731 13795915 27075 19860 6029700 6283 64516 23842803 2534377 ...
## $ Area..km.. : int 652230 28748 2381741 199 468 1246700 91 442 2780400 29743 ...
## $ Density..per.km.. : num 63.1 98.9 18.9 222.5 170.6 ...
## $ Growth.Rate : num 1.026 0.996 1.016 0.983 1.01 ...
## $ World.Population.Percentage: num 0.52 0.04 0.56 0 0 0.45 0 0 0.57 0.03 ...
# Shape of the dataset: number of rows, then number of columns.
c(nrow(population), ncol(population))
## [1] 234 17
# Flag rows that exactly repeat an earlier row, and count them.
is_repeat <- duplicated(population)
sum(is_repeat)
## [1] 0
# Keep only the first occurrence of each row.
population <- population[!is_repeat, ]
# Count the missing values in each column.
colSums(is.na(population))
## Rank CCA3
## 0 0
## Country.Territory Capital
## 0 0
## Continent X2022.Population
## 0 0
## X2020.Population X2015.Population
## 0 0
## X2010.Population X2000.Population
## 0 0
## X1990.Population X1980.Population
## 0 0
## X1970.Population Area..km..
## 0 0
## Density..per.km.. Growth.Rate
## 0 0
## World.Population.Percentage
## 0
#Boxplot for outliers
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Collect the numeric columns for the outlier boxplots.
# The population counts are stored as 32-bit integers, and boxplot() adds pairs
# of values while computing its five-number summary. For counts above roughly
# 1.07 billion that sum exceeds the integer range and becomes NA, which is what
# the "NAs produced by integer overflow" warnings reported. Converting every
# column to double before plotting avoids the overflow.
numeric_cols <- population %>%
  select(where(is.numeric)) %>%
  mutate(across(everything(), as.numeric))
boxplot(numeric_cols, main = "Boxplot of Numeric Variables", las = 2)
# Count the missing values per column once more (same check as earlier).
colSums(is.na(population))
## Rank CCA3
## 0 0
## Country.Territory Capital
## 0 0
## Continent X2022.Population
## 0 0
## X2020.Population X2015.Population
## 0 0
## X2010.Population X2000.Population
## 0 0
## X1990.Population X1980.Population
## 0 0
## X1970.Population Area..km..
## 0 0
## Density..per.km.. Growth.Rate
## 0 0
## World.Population.Percentage
## 0
# Count exact duplicate rows once more (same check as earlier).
sum(duplicated(population))
## [1] 0
# Generate a new variable from the world population dataset: a projected
# population for 2030.
summary(population$Growth.Rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.912 1.002 1.008 1.010 1.017 1.069
# Number of years between the 2022 baseline and the 2030 target.
t <- 2030 - 2022
# Compound the 2022 population forward by t years. The horizon comes from `t`
# instead of a second hard-coded 8, so the two can no longer drift apart.
# NOTE(review): assumes Growth.Rate is an annual multiplicative factor (the
# summary above shows values clustered around 1) -- confirm with the data source.
population$Population_2030 <- population$X2022.Population *
  population$Growth.Rate^t
head(population[, c("Country.Territory", "X2022.Population", "Growth.Rate", "Population_2030")])
## Country.Territory X2022.Population Growth.Rate Population_2030
## 1 Afghanistan 41128771 1.0257 50385848.48
## 2 Albania 2842321 0.9957 2746004.10
## 3 Algeria 44903225 1.0164 51144010.98
## 4 American Samoa 44273 0.9831 38629.63
## 5 Andorra 79824 1.0100 86437.95
## 6 Angola 35588987 1.0315 45610988.54
summary(population[["Growth.Rate"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.912 1.002 1.008 1.010 1.017 1.069
# The same 8-year projection written with a log growth rate:
# P_2030 = P_2022 * exp(r * t), where r = log(Growth.Rate).
t <- 8
population[["r"]] <- log(population[["Growth.Rate"]])
population[["Population_2030"]] <- population[["X2022.Population"]] *
  exp(population[["r"]] * t)
# Preview the inputs next to the projected value.
preview_cols <- c("Country.Territory", "X2022.Population", "Growth.Rate", "Population_2030")
head(population[, preview_cols])
## Country.Territory X2022.Population Growth.Rate Population_2030
## 1 Afghanistan 41128771 1.0257 50385848.48
## 2 Albania 2842321 0.9957 2746004.10
## 3 Algeria 44903225 1.0164 51144010.98
## 4 American Samoa 44273 0.9831 38629.63
## 5 Andorra 79824 1.0100 86437.95
## 6 Angola 35588987 1.0315 45610988.54
# Value extraction and plots: the ten rows with the largest 2022 population,
# sorted from largest to smallest.
top10 <- population %>%
  arrange(desc(X2022.Population)) %>%
  slice_head(n = 10)
top10[["Country.Territory"]]
## [1] "China" "India" "United States" "Indonesia"
## [5] "Pakistan" "Nigeria" "Brazil" "Bangladesh"
## [9] "Russia" "Mexico"
# Horizontal bar chart of the ten countries, ordered by 2022 population.
ggplot(
  top10,
  aes(x = reorder(Country.Territory, X2022.Population), y = X2022.Population)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Most Populous Countries (2022)",
    x = "Country",
    y = "Population"
  ) +
  theme_minimal()
# Keep the country name and the population snapshots from 1990 onwards.
top10_trend <- top10 %>%
  select(
    Country.Territory,
    X1990.Population,
    X2000.Population,
    X2010.Population,
    X2015.Population,
    X2020.Population,
    X2022.Population
  )
# Reshape to long form: one row per country and snapshot year.
top10_long <- top10_trend %>%
  pivot_longer(
    cols = -Country.Territory,
    names_to = "Year",
    values_to = "Population"
  )
# Extract the four-digit year from names such as "X1990.Population".
# The pattern is anchored at both ends and escapes the dot, so only the exact
# "X<year>.Population" form is rewritten. The earlier unanchored patterns
# removed every "X" and treated "." as "any character".
top10_long$Year <- as.numeric(
  sub("^X(\\d{4})\\.Population$", "\\1", top10_long$Year)
)
# Layers shared by the two raw-count trend charts.
trend_counts <- ggplot(
  top10_long,
  aes(x = Year, y = Population, colour = Country.Territory)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Population Trend of Top 10 Countries (1990–2022)",
    x = "Year",
    y = "Population"
  )

# Version 1: default y-axis labels.
trend_counts + theme_minimal()

# Version 2: y-axis labels formatted with thousands separators.
trend_counts +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()

# Version 3: population expressed in millions.
ggplot(
  top10_long,
  aes(x = Year, y = Population / 1e6, colour = Country.Territory)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Population Trend of Top 10 Countries (1990–2022)",
    x = "Year",
    y = "Population (Millions)"
  ) +
  theme_minimal()
# Show the column names of the CO2 table.
colnames(co2)
## [1] "Country.Name" "country_code" "Region" "Indicator.Name"
## [5] "X1990" "X1991" "X1992" "X1993"
## [9] "X1994" "X1995" "X1996" "X1997"
## [13] "X1998" "X1999" "X2000" "X2001"
## [17] "X2002" "X2003" "X2004" "X2005"
## [21] "X2006" "X2007" "X2008" "X2009"
## [25] "X2010" "X2011" "X2012" "X2013"
## [29] "X2014" "X2015" "X2016" "X2017"
## [33] "X2018" "X2019" "X2019.1"
# Check which of the top-10 country names also occur in the CO2 table.
in_co2 <- top10$Country.Territory %in% co2$Country.Name
in_co2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
# Names that have no match in the CO2 table.
top10$Country.Territory[!in_co2]
## [1] "Russia"
# Relabel the CO2 entry so it matches the name used in the population table.
co2$Country.Name <- replace(
  co2$Country.Name,
  co2$Country.Name == "Russian Federation",
  "Russia"
)
# Repeat the membership check after the rename.
top10$Country.Territory %in% co2$Country.Name
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Keep only the CO2 rows for the ten most populous countries.
co2_top10 <- co2 %>%
  filter(Country.Name %in% top10$Country.Territory)
# Keep the country name plus the yearly columns. The pattern accepts only "X"
# followed by exactly four digits, which leaves out "X2019.1". That column
# repeats the X2019 values (most likely a duplicated 2019 header renamed on
# import); keeping it would add a bogus "year 2019.1" to every series.
co2_trend <- co2_top10 %>%
  select(Country.Name, matches("^X\\d{4}$"))
# Reshape to long form: one row per country and year.
co2_long <- co2_trend %>%
  pivot_longer(
    cols = -Country.Name,
    names_to = "Year",
    values_to = "CO2"
  )
# Strip the leading "X" that read.csv() added to the year headers.
co2_long$Year <- as.numeric(sub("^X", "", co2_long$Year))
# The indicator in this table is per-capita emissions, so the axis says so.
ggplot(co2_long, aes(x = Year, y = CO2, color = Country.Name)) +
  geom_line() +
  labs(
    title = "CO2 Emissions Trend (1990–2019)",
    x = "Year",
    y = "CO2 Emissions (metric tons per capita)"
  ) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal()
# Peak value per country, largest first.
co2_long %>%
  group_by(Country.Name) %>%
  summarise(max_CO2 = max(CO2, na.rm = TRUE)) %>%
  arrange(desc(max_CO2))
## # A tibble: 10 × 2
## Country.Name max_CO2
## <chr> <dbl>
## 1 United States 20.5
## 2 Russia 14.6
## 3 China 7.61
## 4 Mexico 4.19
## 5 Brazil 2.52
## 6 Indonesia 2.29
## 7 India 1.81
## 8 Pakistan 0.956
## 9 Nigeria 0.917
## 10 Bangladesh 0.557
str(co2_long)
## tibble [300 × 3] (S3: tbl_df/tbl/data.frame)
## $ Country.Name: chr [1:300] "Bangladesh" "Bangladesh" "Bangladesh" "Bangladesh" ...
## $ Year : num [1:300] 1990 1991 1992 1993 1994 ...
## $ CO2 : num [1:300] 0.112 0.103 0.109 0.114 0.12 ...
# List the distinct country names present in the CO2 table.
unique(co2$Country.Name)
## [1] "Aruba" "Afghanistan"
## [3] "Angola" "Albania"
## [5] "Andorra" "United Arab Emirates"
## [7] "Argentina" "Armenia"
## [9] "American Samoa" "Antigua and Barbuda"
## [11] "Australia" "Austria"
## [13] "Azerbaijan" "Burundi"
## [15] "Belgium" "Benin"
## [17] "Burkina Faso" "Bangladesh"
## [19] "Bulgaria" "Bahrain"
## [21] "Bahamas, The" "Bosnia and Herzegovina"
## [23] "Belarus" "Belize"
## [25] "Bermuda" "Bolivia"
## [27] "Brazil" "Barbados"
## [29] "Brunei Darussalam" "Bhutan"
## [31] "Botswana" "Central African Republic"
## [33] "Canada" "Switzerland"
## [35] "Chile" "China"
## [37] "Cote d'Ivoire" "Cameroon"
## [39] "Congo, Dem. Rep." "Congo, Rep."
## [41] "Colombia" "Comoros"
## [43] "Cabo Verde" "Costa Rica"
## [45] "Cuba" "Curacao"
## [47] "Cayman Islands" "Cyprus"
## [49] "Czech Republic" "Germany"
## [51] "Djibouti" "Dominica"
## [53] "Denmark" "Dominican Republic"
## [55] "Algeria" "Ecuador"
## [57] "Egypt, Arab Rep." "Eritrea"
## [59] "Spain" "Estonia"
## [61] "Ethiopia" "Finland"
## [63] "Fiji" "France"
## [65] "Faroe Islands" "Micronesia, Fed. Sts."
## [67] "Gabon" "United Kingdom"
## [69] "Georgia" "Ghana"
## [71] "Gibraltar" "Guinea"
## [73] "Gambia, The" "Guinea-Bissau"
## [75] "Equatorial Guinea" "Greece"
## [77] "Grenada" "Greenland"
## [79] "Guatemala" "Guam"
## [81] "Guyana" "Hong Kong SAR, China"
## [83] "Honduras" "Croatia"
## [85] "Haiti" "Hungary"
## [87] "Indonesia" "Isle of Man"
## [89] "India" "Ireland"
## [91] "Iran, Islamic Rep." "Iraq"
## [93] "Iceland" "Israel"
## [95] "Italy" "Jamaica"
## [97] "Jordan" "Japan"
## [99] "Kazakhstan" "Kenya"
## [101] "Kyrgyz Republic" "Cambodia"
## [103] "Kiribati" "St. Kitts and Nevis"
## [105] "Korea, Rep." "Kuwait"
## [107] "Lao PDR" "Lebanon"
## [109] "Liberia" "Libya"
## [111] "St. Lucia" "Liechtenstein"
## [113] "Sri Lanka" "Lesotho"
## [115] "Lithuania" "Luxembourg"
## [117] "Latvia" "Macao SAR, China"
## [119] "St. Martin (French part)" "Morocco"
## [121] "Monaco" "Moldova"
## [123] "Madagascar" "Maldives"
## [125] "Mexico" "Marshall Islands"
## [127] "North Macedonia" "Mali"
## [129] "Malta" "Myanmar"
## [131] "Montenegro" "Mongolia"
## [133] "Northern Mariana Islands" "Mozambique"
## [135] "Mauritania" "Mauritius"
## [137] "Malawi" "Malaysia"
## [139] "Namibia" "New Caledonia"
## [141] "Niger" "Nigeria"
## [143] "Nicaragua" "Netherlands"
## [145] "Norway" "Nepal"
## [147] "Nauru" "New Zealand"
## [149] "Oman" "Pakistan"
## [151] "Panama" "Peru"
## [153] "Philippines" "Palau"
## [155] "Papua New Guinea" "Poland"
## [157] "Puerto Rico" "Korea, Dem. People's Rep."
## [159] "Portugal" "Paraguay"
## [161] "West Bank and Gaza" "French Polynesia"
## [163] "Qatar" "Romania"
## [165] "Russia" "Rwanda"
## [167] "Saudi Arabia" "Sudan"
## [169] "Senegal" "Singapore"
## [171] "Solomon Islands" "Sierra Leone"
## [173] "El Salvador" "San Marino"
## [175] "Somalia" "Serbia"
## [177] "South Sudan" "Sao Tome and Principe"
## [179] "Suriname" "Slovak Republic"
## [181] "Slovenia" "Sweden"
## [183] "Eswatini" "Sint Maarten (Dutch part)"
## [185] "Seychelles" "Syrian Arab Republic"
## [187] "Turks and Caicos Islands" "Chad"
## [189] "Togo" "Thailand"
## [191] "Tajikistan" "Turkmenistan"
## [193] "Timor-Leste" "Tonga"
## [195] "Trinidad and Tobago" "Tunisia"
## [197] "Turkiye" "Tuvalu"
## [199] "Tanzania" "Uganda"
## [201] "Ukraine" "Uruguay"
## [203] "United States" "Uzbekistan"
## [205] "St. Vincent and the Grenadines" "Venezuela, RB"
## [207] "British Virgin Islands" "Virgin Islands (U.S.)"
## [209] "Vietnam" "Vanuatu"
## [211] "Samoa" "Yemen, Rep."
## [213] "South Africa" "Zambia"
## [215] "Zimbabwe"
# Print the top-10 country names again for comparison with the CO2 names.
top10$Country.Territory
## [1] "China" "India" "United States" "Indonesia"
## [5] "Pakistan" "Nigeria" "Brazil" "Bangladesh"
## [9] "Russia" "Mexico"
# Print the CO2 column names again.
colnames(co2)
## [1] "Country.Name" "country_code" "Region" "Indicator.Name"
## [5] "X1990" "X1991" "X1992" "X1993"
## [9] "X1994" "X1995" "X1996" "X1997"
## [13] "X1998" "X1999" "X2000" "X2001"
## [17] "X2002" "X2003" "X2004" "X2005"
## [21] "X2006" "X2007" "X2008" "X2009"
## [25] "X2010" "X2011" "X2012" "X2013"
## [29] "X2014" "X2015" "X2016" "X2017"
## [33] "X2018" "X2019" "X2019.1"
# Show the storage type of every CO2 column together with a few sample values.
str(co2)
## 'data.frame': 215 obs. of 35 variables:
## $ Country.Name : chr "Aruba" "Afghanistan" "Angola" "Albania" ...
## $ country_code : chr "ABW" "AFG" "AGO" "ALB" ...
## $ Region : chr "Latin America & Caribbean" "South Asia" "Sub-Saharan Africa" "Europe & Central Asia" ...
## $ Indicator.Name: chr "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" "CO2 emissions (metric tons per capita)" ...
## $ X1990 : num NA 0.192 0.554 1.82 7.522 ...
## $ X1991 : num NA 0.168 0.545 1.243 7.235 ...
## $ X1992 : num NA 0.096 0.544 0.684 6.963 ...
## $ X1993 : num NA 0.0847 0.709 0.6383 6.7242 ...
## $ X1994 : num NA 0.0755 0.8368 0.6454 6.5416 ...
## $ X1995 : num NA 0.0685 0.9121 0.6054 6.7335 ...
## $ X1996 : num NA 0.0626 1.0722 0.6124 6.9916 ...
## $ X1997 : num NA 0.0568 1.0866 0.4669 7.3074 ...
## $ X1998 : num NA 0.0527 1.0918 0.5722 7.6395 ...
## $ X1999 : num NA 0.0402 1.1099 0.9554 7.9232 ...
## $ X2000 : num NA 0.0366 0.9881 1.0262 7.9523 ...
## $ X2001 : num NA 0.0338 0.9418 1.0555 7.7215 ...
## $ X2002 : num NA 0.0456 0.8956 1.2324 7.5662 ...
## $ X2003 : num NA 0.0515 0.9249 1.339 7.2424 ...
## $ X2004 : num NA 0.0417 0.9303 1.4041 7.3443 ...
## $ X2005 : num NA 0.0604 0.8135 1.3382 7.3538 ...
## $ X2006 : num NA 0.0666 0.8218 1.34 6.7905 ...
## $ X2007 : num NA 0.0653 0.8118 1.3939 6.531 ...
## $ X2008 : num NA 0.128 0.889 1.384 6.439 ...
## $ X2009 : num NA 0.172 0.939 1.441 6.157 ...
## $ X2010 : num NA 0.244 0.976 1.528 6.157 ...
## $ X2011 : num NA 0.297 0.986 1.669 5.851 ...
## $ X2012 : num NA 0.259 0.951 1.503 5.945 ...
## $ X2013 : num NA 0.186 1.036 1.534 5.943 ...
## $ X2014 : num NA 0.146 1.1 1.668 5.807 ...
## $ X2015 : num NA 0.173 1.135 1.604 6.026 ...
## $ X2016 : num NA 0.15 1.03 1.56 6.08 ...
## $ X2017 : num NA 0.132 0.813 1.789 6.104 ...
## $ X2018 : num NA 0.163 0.778 1.783 6.363 ...
## $ X2019 : num NA 0.16 0.792 1.692 6.481 ...
## $ X2019.1 : num NA 0.16 0.792 1.692 6.481 ...
# Reload the CO2 table from disk to start again from the raw data.
co2 <- read.csv("CO2_emission.csv")
dim(co2)
## [1] 215 35
# Reloading discards the earlier in-memory rename, so apply it again. Without
# it "Russia" has no match in the CO2 table and the filter below silently
# returns 9 countries instead of 10.
co2$Country.Name[co2$Country.Name == "Russian Federation"] <- "Russia"
top10$Country.Territory
## [1] "China" "India" "United States" "Indonesia"
## [5] "Pakistan" "Nigeria" "Brazil" "Bangladesh"
## [9] "Russia" "Mexico"
co2_top10 <- co2 %>%
  filter(Country.Name %in% top10$Country.Territory)
dim(co2_top10)
## [1] 10 35
# Keep the country name plus the yearly columns. Only "X" followed by exactly
# four digits is accepted, which leaves out "X2019.1"; that column repeats the
# X2019 values and would otherwise become a bogus "year 2019.1".
co2_trend <- co2_top10 %>%
  select(Country.Name, matches("^X\\d{4}$"))
# Reshape to long form: one row per country and year.
co2_long <- co2_trend %>%
  pivot_longer(
    cols = -Country.Name,
    names_to = "Year",
    values_to = "CO2"
  )
# Strip the leading "X" from the year headers. The CO2 values are already
# numeric after import, so they need no conversion.
co2_long$Year <- as.numeric(sub("^X", "", co2_long$Year))
# Drop country-year rows that have no value.
co2_long <- na.omit(co2_long)
# Per-country CO2 series, drawn before the countries are reordered.
ggplot(co2_long, aes(x = Year, y = CO2, colour = Country.Name)) +
  geom_line() +
  geom_point() +
  theme_minimal()

# Rank the countries by their peak value, highest first.
peak_by_country <- co2_long %>%
  group_by(Country.Name) %>%
  summarise(max_CO2 = max(CO2, na.rm = TRUE)) %>%
  arrange(desc(max_CO2))

# Turn the country into a factor whose level order follows that ranking.
co2_long$Country.Name <- factor(
  co2_long$Country.Name,
  levels = peak_by_country$Country.Name
)

# Same chart again, now using the reordered factor.
ggplot(co2_long, aes(x = Year, y = CO2, colour = Country.Name)) +
  geom_line() +
  geom_point() +
  theme_minimal()
# Correlation analysis
# Select the four variables to compare. This selection used to be run twice in
# a row with identical code; once is enough.
corr_data <- population %>%
  select(
    Area..km..,
    Density..per.km..,
    Growth.Rate,
    World.Population.Percentage
  )
# Pairwise correlations (cor() defaults to Pearson).
cor_matrix <- cor(corr_data)
cor_matrix
## Area..km.. Density..per.km.. Growth.Rate
## Area..km.. 1.00000000 -0.06312785 -0.01397017
## Density..per.km.. -0.06312785 1.00000000 -0.06975328
## Growth.Rate -0.01397017 -0.06975328 1.00000000
## World.Population.Percentage 0.45328363 -0.02764600 -0.02092954
## World.Population.Percentage
## Area..km.. 0.45328363
## Density..per.km.. -0.02764600
## Growth.Rate -0.02092954
## World.Population.Percentage 1.00000000
library(ggplot2)
# Reshape the correlation matrix into long form: one row per pair of
# variables (Var1, Var2) with the coefficient stored in Freq.
corr_long <- as.data.frame(as.table(cor_matrix))
# Heatmap of the coefficients, labelled with values rounded to two decimals.
ggplot(corr_long, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = round(Freq, digits = 2))) +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0
  ) +
  theme_minimal() +
  labs(title = "Correlation Heatmap")
# Country name and 2022 population only.
pop_2022 <- population %>%
  select(Country.Territory, X2022.Population)
# Select the yearly CO2 columns by name (those starting with "X"). Single-bracket
# list-style indexing always returns a data frame, even if only one column
# matches.
co2_years <- co2[grep("^X", colnames(co2))]
# Each yearly column coerced to numeric, kept as a list.
co2_years_num <- lapply(co2_years, as.numeric)
# Summarise every yearly column. Reusing co2_years avoids the hard-coded
# positions 5:35, which would pick the wrong columns if the layout changed.
lapply(co2_years, summary)
## $X1990
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.001183 0.434249 1.914543 4.404504 6.910335 30.195189 30
##
## $X1991
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.001158 0.452759 1.941825 4.290648 6.497471 31.778496 29
##
## $X1992
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00113 0.47138 2.01221 4.17483 6.48231 29.63244 26
##
## $X1993
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0011 0.4485 1.8676 4.0855 6.4309 33.1220 26
##
## $X1994
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.001071 0.406659 1.830972 4.045067 6.337175 36.466263 26
##
## $X1995
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.001043 0.423225 1.832534 4.136628 6.360589 37.102174 25
##
## $X1996
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.4619 1.8750 4.2111 6.5817 40.0742 25
##
## $X1997
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000991 0.475029 2.034502 4.250716 6.478028 47.429575 25
##
## $X1998
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03557 0.49675 2.15638 4.28281 6.36991 48.04502 26
##
## $X1999
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0367 0.5090 2.1323 4.2657 6.2332 50.8338 26
##
## $X2000
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0346 0.5143 2.1680 4.2444 6.0749 48.3740 25
##
## $X2001
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03065 0.55498 2.20486 4.32311 6.22776 46.58438 25
##
## $X2002
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03132 0.57494 2.29148 4.34969 6.39291 50.71216 24
##
## $X2003
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02316 0.57187 2.17408 4.46685 6.81868 50.95403 24
##
## $X2004
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02103 0.58591 2.26863 4.50548 6.57848 49.20805 24
##
## $X2005
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02037 0.63786 2.33715 4.52585 6.51468 44.53381 24
##
## $X2006
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02366 0.62507 2.40391 4.59021 6.55718 42.79831 24
##
## $X2007
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02289 0.62077 2.49795 4.55871 6.69946 40.89652 24
##
## $X2008
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02215 0.58926 2.42452 4.53254 6.57775 36.91871 24
##
## $X2009
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02143 0.60261 2.57505 4.63638 6.88817 32.67180 24
##
## $X2010
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03458 0.61466 2.52286 4.39945 6.15996 32.63430 24
##
## $X2011
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.04019 0.63099 2.49273 4.39217 6.26989 33.49441 24
##
## $X2012
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.04002 0.63350 2.56200 4.44040 6.33634 34.18822 24
##
## $X2013
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02698 0.67563 2.54269 4.32668 6.06988 32.59894 24
##
## $X2014
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02912 0.66928 2.62039 4.22697 5.81403 33.20589 24
##
## $X2015
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03937 0.66734 2.57930 4.18413 5.77668 33.04351 24
##
## $X2016
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03071 0.72849 2.59779 4.19543 5.80213 32.74589 24
##
## $X2017
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03501 0.79195 2.60998 4.19980 5.83406 32.12799 24
##
## $X2018
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03711 0.79561 2.58448 4.16497 5.77019 31.06753 24
##
## $X2019
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03699 0.80176 2.71762 4.11514 5.58516 32.47447 24
##
## $X2019.1
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.03699 0.80176 2.71762 4.11514 5.58516 32.47447 24
# Count the missing values in every yearly CO2 column. The columns are picked
# by name (starting with "X") rather than by the fixed positions 5:35, so the
# call keeps working if columns are added or reordered.
lapply(co2[grep("^X", colnames(co2))], function(x) sum(is.na(x)))
## $X1990
## [1] 30
##
## $X1991
## [1] 29
##
## $X1992
## [1] 26
##
## $X1993
## [1] 26
##
## $X1994
## [1] 26
##
## $X1995
## [1] 25
##
## $X1996
## [1] 25
##
## $X1997
## [1] 25
##
## $X1998
## [1] 26
##
## $X1999
## [1] 26
##
## $X2000
## [1] 25
##
## $X2001
## [1] 25
##
## $X2002
## [1] 24
##
## $X2003
## [1] 24
##
## $X2004
## [1] 24
##
## $X2005
## [1] 24
##
## $X2006
## [1] 24
##
## $X2007
## [1] 24
##
## $X2008
## [1] 24
##
## $X2009
## [1] 24
##
## $X2010
## [1] 24
##
## $X2011
## [1] 24
##
## $X2012
## [1] 24
##
## $X2013
## [1] 24
##
## $X2014
## [1] 24
##
## $X2015
## [1] 24
##
## $X2016
## [1] 24
##
## $X2017
## [1] 24
##
## $X2018
## [1] 24
##
## $X2019
## [1] 24
##
## $X2019.1
## [1] 24