##======= Assignment 1. Importing datasets from different sources ===##
library(readr)
# Load the built-in airquality data set and inspect its columns and structure.
data("airquality")
# View() opens a graphical data viewer, which is typically unavailable when the
# script runs in batch mode, so only call it in an interactive session.
if (interactive()) {
  View(airquality)
}
variable.names(airquality)
## [1] "Ozone" "Solar.R" "Wind" "Temp" "Month" "Day"
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
###IMPORTING DATASETS FROM DBMS
library(DBI)
library(RMySQL)
# Create connection
# NOTE(review): credentials are hard-coded (root user, empty password); outside
# a local test database they should come from the environment instead.
con <- dbConnect(
  MySQL(),
  user = "root",
  password = "",
  dbname = "r_testing",
  host = "127.0.0.1"
)
# Confirm the connection is usable before querying it.
dbIsValid(con)
## [1] TRUE
dbListTables(con)
## [1] "students"
# Pull the whole "students" table into a data frame.
datatable <- dbReadTable(con, "students")
# The data is now in memory, so release the connection instead of leaving it
# open for the rest of the session. invisible() keeps the return value out of
# the printed output.
invisible(dbDisconnect(con))
# View() needs a graphical viewer, so only call it in an interactive session.
if (interactive()) {
  View(datatable)
}
# Mean of the two numeric columns; colMeans() returns the same named vector as
# apply(..., 2, mean) without the generic row/column apply machinery.
colMeans(datatable[, c("ages", "score")])
## ages score
## 21.75 16.25
## Importing datasets from Kaggle
# Reads a CSV file that is expected to sit in the current working directory.
ai_student_impact_dataset_1_ <- read_csv("ai_student_impact_dataset (1).csv")
## Rows: 50000 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Major_Category, Year_of_Study, Primary_Use_Case, Prompt_Engineering...
## dbl (9): Student_ID, Pre_Semester_GPA, Weekly_GenAI_Hours, Tool_Diversity, T...
## lgl (1): Paid_Subscription
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() needs a graphical viewer, so only call it in an interactive session.
if (interactive()) {
  View(ai_student_impact_dataset_1_)
}
##=== Assignment 2. Merging datasets using 2 & 3 variables ===##
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ purrr 1.2.2
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the CO2-emission and world-population tables from the project folder
# and list their column names.
CO2 <- read.csv("~/myprojectstudy/CO2_emission.csv")
# View() needs a graphical viewer, so only call it in an interactive session.
if (interactive()) {
  View(CO2)
}
population <- read.csv("~/myprojectstudy/world_population.csv")
if (interactive()) {
  View(population)
}
variable.names(CO2)
## [1] "Country.Name" "country_code" "Region" "Indicator.Name"
## [5] "X1990" "X1991" "X1992" "X1993"
## [9] "X1994" "X1995" "X1996" "X1997"
## [13] "X1998" "X1999" "X2000" "X2001"
## [17] "X2002" "X2003" "X2004" "X2005"
## [21] "X2006" "X2007" "X2008" "X2009"
## [25] "X2010" "X2011" "X2012" "X2013"
## [29] "X2014" "X2015" "X2016" "X2017"
## [33] "X2018" "X2019" "X2019.1"
variable.names(population)
## [1] "Rank" "CCA3"
## [3] "Country.Territory" "Capital"
## [5] "Continent" "X2022.Population"
## [7] "X2020.Population" "X2015.Population"
## [9] "X2010.Population" "X2000.Population"
## [11] "X1990.Population" "X1980.Population"
## [13] "X1970.Population" "Area..km.."
## [15] "Density..per.km.." "Growth.Rate"
## [17] "World.Population.Percentage"
# MERGING
# Join the two tables on two key pairs: continent/region and country name.
# With merge()'s defaults only rows whose keys match in both tables are kept.
# NOTE(review): the recorded head(merged) output further down shows only three
# rows, all "North America" - confirm that Region and Continent really use the
# same labels before relying on this join.
merged <- merge(
  x = population,
  y = CO2,
  by.x = c("Continent", "Country.Territory"),
  by.y = c("Region", "Country.Name")
)
names(merged)
## [1] "Continent" "Country.Territory"
## [3] "Rank" "CCA3"
## [5] "Capital" "X2022.Population"
## [7] "X2020.Population" "X2015.Population"
## [9] "X2010.Population" "X2000.Population"
## [11] "X1990.Population" "X1980.Population"
## [13] "X1970.Population" "Area..km.."
## [15] "Density..per.km.." "Growth.Rate"
## [17] "World.Population.Percentage" "country_code"
## [19] "Indicator.Name" "X1990"
## [21] "X1991" "X1992"
## [23] "X1993" "X1994"
## [25] "X1995" "X1996"
## [27] "X1997" "X1998"
## [29] "X1999" "X2000"
## [31] "X2001" "X2002"
## [33] "X2003" "X2004"
## [35] "X2005" "X2006"
## [37] "X2007" "X2008"
## [39] "X2009" "X2010"
## [41] "X2011" "X2012"
## [43] "X2013" "X2014"
## [45] "X2015" "X2016"
## [47] "X2017" "X2018"
## [49] "X2019" "X2019.1"
# Preview the first rows of the merged table.
head(merged)
## Continent Country.Territory Rank CCA3 Capital X2022.Population
## 1 North America Bermuda 206 BMU Hamilton 64184
## 2 North America Canada 39 CAN Ottawa 38454327
## 3 North America United States 3 USA Washington, D.C. 338289857
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 64031 63144 63447 61371
## 2 37888705 35732126 33963412 30683313
## 3 335942003 324607776 311182845 282398554
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 57470 53565 52019 54
## 2 27657204 24511510 21434577 9984670
## 3 248083732 223140018 200328340 9372610
## Density..per.km.. Growth.Rate World.Population.Percentage country_code
## 1 1188.5926 1.0000 0.00 BMU
## 2 3.8513 1.0078 0.48 CAN
## 3 36.0935 1.0038 4.24 USA
## Indicator.Name X1990 X1991 X1992 X1993
## 1 CO2 emissions (metric tons per capita) NA NA NA NA
## 2 CO2 emissions (metric tons per capita) 15.14889 14.74101 15.02823 14.71339
## 3 CO2 emissions (metric tons per capita) 19.40735 19.00340 19.02286 19.21833
## X1994 X1995 X1996 X1997 X1998 X1999 X2000 X2001
## 1 NA NA NA NA NA NA NA NA
## 2 15.06035 15.29060 15.59259 15.94399 16.07651 16.25852 16.75763 16.33157
## 3 19.25621 19.21691 19.57538 20.33086 20.26630 20.10113 20.46981 20.17153
## X2002 X2003 X2004 X2005 X2006 X2007 X2008 X2009
## 1 NA NA NA NA NA NA NA NA
## 2 16.72030 17.2083 16.79427 17.02749 16.59535 17.38057 16.55692 15.50215
## 3 19.44555 19.5065 19.59763 19.46926 18.94591 19.04291 18.27849 16.80870
## X2010 X2011 X2012 X2013 X2014 X2015 X2016 X2017
## 1 NA NA NA NA NA NA NA NA
## 2 15.79214 15.99594 15.73447 15.83846 15.84991 15.64859 15.42060 15.54457
## 3 17.43174 16.60422 15.78978 16.11118 16.04092 15.56003 15.14989 14.82326
## X2018 X2019 X2019.1
## 1 NA NA NA
## 2 15.65058 15.43061 15.43061
## 3 15.22255 14.67341 14.67341
##===== Assignment 3. Use of group_by() and %>% =====##
# The pipe (%>%): instead of nesting functions inside each other, the pipe lets
# you write steps from left to right, top to bottom — the way you read. Each
# step receives the output of the previous step as its input data. Without the
# pipe, the code is hard to read because the logic runs inside-out.
# group_by(): tells every function that comes after it “apply yourself within
# each group separately, not across the whole table.” By itself it does nothing
# useful — its power comes from what follows it.
# Per-continent summary of the merged table: mean of the X2019 CO2 column,
# total 2022 population and number of rows, sorted by mean CO2 (highest first).
merged %>%
  group_by(Continent) %>%
  summarise(
    avg_CO2 = mean(X2019, na.rm = TRUE),
    total_pop = sum(X2022.Population, na.rm = TRUE),
    country_count = n()
  ) %>%
  arrange(desc(avg_CO2))
## # A tibble: 1 × 4
## Continent avg_CO2 total_pop country_count
## <chr> <dbl> <int> <int>
## 1 North America 15.1 376808368 3
##==== Assignment 4. Using trace() and recover() for debugging ====##
# trace(): injects code into a function without editing it. The injected code
# runs every time that function is called.
# recover(): when an error occurs, drops you into any frame of the call stack
# so you can inspect variables at any level.
# A simple function used as the target of the trace() demo.
#' Add two values
#'
#' @param x,y Values to add.
#' @return The result of `x + y`.
add_numbers <- function(x, y) {
  total <- x + y
  total
}
# Inject a print at the start of add_numbers
trace("add_numbers", quote(cat("Called with x =", x, "y =", y, "\n")))
## [1] "add_numbers"
add_numbers(3, 5)
## Tracing add_numbers(3, 5) on entry
## Called with x = 3 y = 5
## [1] 8
add_numbers(10, -2)
## Tracing add_numbers(10, -2) on entry
## Called with x = 10 y = -2
## [1] 8
# Remove the tracer again; otherwise every later call to add_numbers() in this
# session keeps printing the injected message.
untrace("add_numbers")
# When an error occurs, instead of just printing the error message and stopping, recover() pauses execution and shows you the entire call stack. You pick any frame to enter and inspect variables there — like rewinding time to any point before the crash
# Three nested functions
# Outermost frame of the demo call stack: passes x + 1 to middle_fn() and
# doubles whatever comes back.
outer_fn <- function(x) {
  from_middle <- middle_fn(x + 1)
  from_middle * 2
}
# Middle frame of the demo call stack: passes x to inner_fn() and adds 5 to
# the value it returns.
middle_fn <- function(x) {
  inner_fn(x) + 5
}
# Innermost frame of the demo call stack. It accepts x but never uses it and
# always fails, because log() is handed a character string. The failure is the
# point of the demo: it gives recover() an error to stop on.
inner_fn <- function(x) {
  log("not a number") # BUG: passing a string to log()
}
# Turn on: errors will now open the call stack browser.
# options() returns the previous settings, so keep them in order to restore
# the earlier error handler afterwards instead of assuming there was none.
old_opts <- options(error = recover)
# To see recover() in action, run the failing call on its own line in an
# interactive session:
#   outer_fn(3)
# An earlier attempt at this point failed to parse:
#Error: unexpected symbol in:
#"outer_fn(3)
#options"
# Turn off: put back whatever error handler was active before.
options(old_opts)
##==== Assignment 5. Creating a function to compute a mean =====##
#' Arithmetic mean of a numeric vector
#'
#' @param x A non-empty numeric (integer or double) vector.
#' @param na.rm If `TRUE`, missing values are dropped before averaging.
#'   Defaults to `FALSE`, in which case any `NA` in `x` makes the result `NA`.
#' @return A single double: the sum of the values divided by their count.
#' @details Stops with an error if `x` is not numeric or has length zero.
data_mean <- function(x, na.rm = FALSE) {
  # Input validation
  if (!is.numeric(x)) {
    stop("Input must be a numeric vector")
  }
  if (length(x) == 0) {
    stop("Input vector cannot be empty")
  }
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  # Accumulate in double precision: sum() on an integer vector returns NA with
  # an "integer overflow" warning once the total exceeds .Machine$integer.max,
  # which large counts such as populations easily do.
  sum(as.numeric(x)) / length(x)
}
# Mean of the 2020 population column, computed with data_mean().
data_mean(population$X2020.Population)
## [1] 33501071
##===== Assignment 6. sapply(), vapply(), map() and mapply() ====##
# sapply(): applies a function to each element of a vector, list, or data frame
# and tries to simplify the output into a vector or matrix.
# vapply(): means "vector apply". It applies a function to each element and
# checks whether the returned result matches the specified type; it produces an
# error if the types differ.
# map(): is from the purrr package in the tidyverse. It applies a function to
# each element of a list or vector.
# mapply(): means "multivariate apply". It applies a function simultaneously to
# multiple vectors or lists.
# sapply
# Mean of each selected year column, ignoring missing values.
years <- c("X2015", "X2016", "X2017", "X2018", "X2019")
sapply(CO2[years], mean, na.rm = TRUE)
## X2015 X2016 X2017 X2018 X2019
## 4.184129 4.195432 4.199802 4.164970 4.115138
# vapply
# Same per-column mean as the sapply() call above, but FUN.VALUE declares that
# every column must yield exactly one numeric value; anything else is an error.
vapply(CO2[years], mean, FUN.VALUE = numeric(1), na.rm = TRUE)
# Load purrr
library(purrr)
# map
# Maximum of each selected year column, returned as a named double vector.
map_dbl(CO2[years], max, na.rm = TRUE)
## X2015 X2016 X2017 X2018 X2019
## 33.04351 32.74589 32.12799 31.06753 32.47447
# mapply
# Row-by-row total of the 2018 and 2019 columns. na.rm is a constant setting,
# not a vector to iterate over, so it is passed through MoreArgs.
mapply(
  sum,
  CO2$X2018,
  CO2$X2019,
  MoreArgs = list(na.rm = TRUE)
)
## [1] 0.00000000 0.32311967 1.56981200 3.47498727 12.84419283 37.72024134
## [7] 7.71642226 4.02051226 0.00000000 10.65141659 30.73179640 14.42704817
## [13] 6.83523018 0.12418554 16.29192254 1.26464186 0.48248440 1.07364246
## [19] 11.43289089 39.89730848 14.55241272 13.18210069 12.36774508 3.28415374
## [25] 0.00000000 3.98652793 4.12966552 8.68106588 33.70955495 2.76781910
## [31] 6.39013398 0.09986644 31.08119187 8.76103245 9.44545665 15.09326886
## [37] 0.82760395 0.74473508 0.07409836 2.58683519 3.19239852 0.73651380
## [43] 2.30376590 3.30665804 4.64191032 0.00000000 0.00000000 12.05296990
## [49] 18.51485436 16.44781711 0.84854235 4.74086410 10.82633206 4.91726429
## [55] 7.89775954 4.61098725 5.00286636 0.49781308 10.58392239 19.26098157
## [61] 0.32165360 15.45549455 3.67650907 9.04464425 0.00000000 3.09080248
## [67] 4.83240450 10.64826267 5.28568376 1.28636184 0.00000000 0.61135973
## [73] 0.48826906 0.34252300 8.85775417 11.65420747 5.81764448 0.00000000
## [79] 2.28366926 0.00000000 6.96563822 0.00000000 2.09686039 8.08294250
## [85] 0.59054706 9.49316039 4.44599474 0.00000000 3.61031662 14.98246700
## [91] 15.39088811 8.68560708 9.36811573 13.79580413 10.68818637 5.77943731
## [97] 4.92482729 17.34266164 23.30825373 0.76410546 3.33591826 1.87864863
## [103] 1.54217441 9.49933649 24.02457120 44.11162679 5.35130983 8.12826444
## [109] 0.47348308 16.85859831 5.86797793 7.63746998 2.09159859 0.70855352
## [115] 8.35649858 30.63663413 7.99762404 0.00000000 0.00000000 3.76285984
## [121] 0.00000000 6.47797987 0.28032546 8.04605945 7.10799519 6.14325194
## [127] 7.51274743 0.58329238 6.49156175 1.29274132 8.19770229 13.95076821
## [133] 0.00000000 0.48261150 1.71079914 6.55863141 0.15389818 15.67927485
## [139] 3.41943488 0.00000000 0.17911969 1.13465816 1.59089814 17.22092463
## [145] 13.79881264 1.01078833 11.19316554 13.43368738 30.93630825 1.76255283
## [151] 5.54995831 3.45210191 2.64430891 27.84602029 1.74213133 15.98725756
## [157] 0.00000000 4.16114118 9.15315396 2.38306702 0.00000000 0.00000000
## [163] 63.54200206 7.67848605 23.29377418 0.21019163 30.75096950 0.98065155
## [169] 1.29377533 16.71330846 1.08888035 0.22760569 2.26926254 0.00000000
## [175] 0.08865660 13.22246498 0.29489586 1.36092522 8.39514600 11.75705173
## [181] 13.29654826 6.94402966 1.63700324 0.00000000 12.24248805 3.12253951
## [187] 0.00000000 0.28258706 0.58604045 7.53846236 1.96437611 24.74860902
## [193] 0.96054259 3.08154699 25.11568562 5.14204105 9.75694376 1.72718813
## [199] 0.42772795 0.26951408 8.09636661 3.77082691 29.89595826 6.88826626
## [205] 4.61935915 8.51357769 0.00000000 0.00000000 6.47745010 1.41779483
## [211] 3.00075030 0.72170176 15.02341470 0.82620618 1.66038979
# Conclusion
# These functions are important in R programming because they simplify
# repetitive tasks, improve efficiency, reduce manual loops, and support data
# analysis and research.
# In CO₂ emission analysis, they help calculate averages, trends, comparisons,
# growth rates, and summary statistics quickly and efficiently.