install.packages(“haven”) library(haven) install.packages(“DBI”) install.packages(“odbc”) install.packages(“RMySQL”) install.packages(“RMariaDB”) library(DBI) library(RMariaDB) install.packages(“tinytex”) tinytex::install_tinytex() install.packages(“writexl”) library(writexl) library(readxl) library(dplyr) library(ggplot2)
##dataset creation
employeedata <- data.frame(
ID = 1:10,
Name = c("Anna", "Ritha", "Oliver", "Jackson", "Jado", "Peter", "Keke", "Danny", "Alex", "Claude"),
Age = c(55, 30, 89, 20, 44, 26, 78, 64, 70, 50),
Salary = c(200, 500, 650, 800, 550, 400, 200, 700, 900, 300)
)
# Dataset display
print(employeedata)
## ID Name Age Salary
## 1 1 Anna 55 200
## 2 2 Ritha 30 500
## 3 3 Oliver 89 650
## 4 4 Jackson 20 800
## 5 5 Jado 44 550
## 6 6 Peter 26 400
## 7 7 Keke 78 200
## 8 8 Danny 64 700
## 9 9 Alex 70 900
## 10 10 Claude 50 300
##dataset reading
library(haven)
write_sav(employeedata, "employeedata.sav")
list.files()
## [1] "Assignment.Rmd" "Assignment.Rproj" "CO2_emission.csv"
## [4] "employeedata.sav" "loan_datasample.dta" "loan_dataset.dta"
## [7] "loan_dataset.xlsx" "world_population.csv"
read_data <- read_sav("employeedata.sav")
head(read_data)
## # A tibble: 6 × 4
## ID Name Age Salary
## <dbl> <chr> <dbl> <dbl>
## 1 1 Anna 55 200
## 2 2 Ritha 30 500
## 3 3 Oliver 89 650
## 4 4 Jackson 20 800
## 5 5 Jado 44 550
## 6 6 Peter 26 400
write_sav(employeedata, "C:/Users/hp/Documents/Assignment/employeedata.sav")
spss_data <- read_sav("C:/Users/hp/Documents/Assignment/employeedata.sav")
head(spss_data)
## # A tibble: 6 × 4
## ID Name Age Salary
## <dbl> <chr> <dbl> <dbl>
## 1 1 Anna 55 200
## 2 2 Ritha 30 500
## 3 3 Oliver 89 650
## 4 4 Jackson 20 800
## 5 5 Jado 44 550
## 6 6 Peter 26 400
set.seed(123)
n <- 100
loan_dataset <- data.frame(
LoanID = 1001:(1000 + n),
ClientID = sample(2001:3000, n),
ClientName = paste("Client", 1:n),
Gender = sample(c("Male", "Female"), n, replace = TRUE),
Age = sample(21:65, n, replace = TRUE),
LoanAmount = round(runif(n, 500000, 20000000), 0),
InterestRate = round(runif(n, 10, 20), 2),
LoanTermMonths = sample(c(12, 24, 36, 48, 60), n, replace = TRUE),
LoanType = sample(c("Personal", "Mortgage", "Business", "Agriculture"), n, replace = TRUE),
DisbursementDate = sample(seq(as.Date("2023-01-01"), as.Date("2025-01-01"), by="day"), n, replace = TRUE),
Status = sample(c("Active", "Closed", "Defaulted", "Pending"), n, replace = TRUE),
CollateralValue = round(runif(n, 0, 30000000), 0)
)
loan_dataset$MonthlyInstallment <- round(
(loan_dataset$LoanAmount * (1 + loan_dataset$InterestRate/100)) / loan_dataset$LoanTermMonths,
0
)
head(loan_dataset)
## LoanID ClientID ClientName Gender Age LoanAmount InterestRate LoanTermMonths
## 1 1001 2415 Client 1 Female 23 10543900 12.41 48
## 2 1002 2463 Client 2 Female 49 3679871 10.58 48
## 3 1003 2179 Client 3 Male 56 12627095 12.37 36
## 4 1004 2526 Client 4 Male 64 19726106 18.90 48
## 5 1005 2195 Client 5 Female 42 13541045 18.12 24
## 6 1006 2938 Client 6 Female 62 8668860 17.48 36
## LoanType DisbursementDate Status CollateralValue MonthlyInstallment
## 1 Agriculture 2023-02-15 Closed 22780071 246925
## 2 Personal 2023-03-05 Active 8156706 84775
## 3 Agriculture 2024-06-14 Defaulted 3467474 394141
## 4 Business 2023-08-17 Pending 880098 488632
## 5 Business 2024-09-01 Closed 24205659 666445
## 6 Business 2023-05-09 Closed 14403793 282894
write_dta(loan_dataset, "loan_dataset.dta")
list.files()
## [1] "Assignment.Rmd" "Assignment.Rproj" "CO2_emission.csv"
## [4] "employeedata.sav" "loan_datasample.dta" "loan_dataset.dta"
## [7] "loan_dataset.xlsx" "world_population.csv"
stata_data <- read_dta("loan_dataset.dta")
head(stata_data)
## # A tibble: 6 × 13
## LoanID ClientID ClientName Gender Age LoanAmount InterestRate LoanTermMonths
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 2415 Client 1 Female 23 10543900 12.4 48
## 2 1002 2463 Client 2 Female 49 3679871 10.6 48
## 3 1003 2179 Client 3 Male 56 12627095 12.4 36
## 4 1004 2526 Client 4 Male 64 19726106 18.9 48
## 5 1005 2195 Client 5 Female 42 13541045 18.1 24
## 6 1006 2938 Client 6 Female 62 8668860 17.5 36
## # ℹ 5 more variables: LoanType <chr>, DisbursementDate <date>, Status <chr>,
## # CollateralValue <dbl>, MonthlyInstallment <dbl>
write_dta(loan_dataset, "C:/Users/hp/Documents/Assignment/loan_datasample.dta")
##reading dataset from database (mysql)
library(DBI)
library(RMariaDB)
con_my <- dbConnect(
RMariaDB::MariaDB(),
dbname = "employee_loan",
host = "localhost",
user = "root",
password = "root",
port = 3306
)
db_data <- dbGetQuery(con_my, "SELECT * FROM loans")
head(db_data)
## LoanID ClientID ClientName Gender Age LoanAmount InterestRate LoanTermMonths
## 1 1001 2415 Client 1 Female 23 10543900 12.41 48
## 2 1002 2463 Client 2 Female 49 3679871 10.58 48
## 3 1003 2179 Client 3 Male 56 12627095 12.37 36
## 4 1004 2526 Client 4 Male 64 19726106 18.90 48
## 5 1005 2195 Client 5 Female 42 13541045 18.12 24
## 6 1006 2938 Client 6 Female 62 8668860 17.48 36
## LoanType
## 1 Agriculture
## 2 Personal
## 3 Agriculture
## 4 Business
## 5 Business
## 6 Business
dbDisconnect(con_my)
##Reading dataset from Excel
library(readxl)
read_data <- read_excel("C:/Users/hp/Documents/Assignment/loan_dataset.xlsx")
head(read_data)
## # A tibble: 6 × 9
## LoanID ClientID ClientName Gender Age LoanAmount InterestRate LoanTermMonths
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 2415 Client 1 Female 23 10543900 12.4 48
## 2 1002 2463 Client 2 Female 49 3679871 10.6 48
## 3 1003 2179 Client 3 Male 56 12627095 12.4 36
## 4 1004 2526 Client 4 Male 64 19726106 18.9 48
## 5 1005 2195 Client 5 Female 42 13541045 18.1 24
## 6 1006 2938 Client 6 Female 62 8668860 17.5 36
## # ℹ 1 more variable: LoanType <chr>
##Calculate Mean of Iris dataset
data("iris")
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
mean(iris$Petal.Length)
## [1] 3.758
mean(iris$Petal.Length[iris$Species=="setosa"])
## [1] 1.462
mean(iris$Petal.Length[iris$Species=="versicolor"])
## [1] 4.26
mean(iris$Petal.Length[iris$Species=="virginica"])
## [1] 5.552
##Dataset exploratory
population_ds <- read.csv("C:/Users/hp/Documents/Assignment/world_population.csv")
co2_data <- read.csv("C:/Users/hp/Documents/Assignment/CO2_emission.csv")
head(population_ds)
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 38972230 33753499 28189672 19542982
## 2 2866849 2882481 2913399 3182021
## 3 43451666 39543154 35856344 30774621
## 4 46189 51368 54849 58230
## 5 77700 71746 71519 66097
## 6 33428485 28127721 23364185 16394062
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 10694796 12486631 10752971 652230
## 2 3295066 2941651 2324731 28748
## 3 25518074 18739378 13795915 2381741
## 4 47818 32886 27075 199
## 5 53569 35611 19860 468
## 6 11828638 8330047 6029700 1246700
## Density..per.km.. Growth.Rate World.Population.Percentage
## 1 63.0587 1.0257 0.52
## 2 98.8702 0.9957 0.04
## 3 18.8531 1.0164 0.56
## 4 222.4774 0.9831 0.00
## 5 170.5641 1.0100 0.00
## 6 28.5466 1.0315 0.45
##Dataset exploratory
head(co2_data)
## Country.Name country_code Region
## 1 Aruba ABW Latin America & Caribbean
## 2 Afghanistan AFG South Asia
## 3 Angola AGO Sub-Saharan Africa
## 4 Albania ALB Europe & Central Asia
## 5 Andorra AND Europe & Central Asia
## 6 United Arab Emirates ARE Middle East & North Africa
## Indicator.Name X1990 X1991 X1992
## 1 CO2 emissions (metric tons per capita) NA NA NA
## 2 CO2 emissions (metric tons per capita) 0.1917451 0.1676816 0.09595774
## 3 CO2 emissions (metric tons per capita) 0.5536620 0.5445386 0.54355722
## 4 CO2 emissions (metric tons per capita) 1.8195416 1.2428102 0.68369983
## 5 CO2 emissions (metric tons per capita) 7.5218317 7.2353792 6.96307870
## 6 CO2 emissions (metric tons per capita) 30.1951886 31.7784962 29.08092584
## X1993 X1994 X1995 X1996 X1997 X1998
## 1 NA NA NA NA NA NA
## 2 0.08472111 0.07554583 0.06846796 0.06258803 0.05682662 0.05269086
## 3 0.70898423 0.83680440 0.91214149 1.07216847 1.08663697 1.09182531
## 4 0.63830704 0.64535519 0.60543625 0.61236736 0.46692147 0.57215370
## 5 6.72417752 6.54157891 6.73347949 6.99159455 7.30744115 7.63953851
## 6 29.27567777 30.84933296 31.12501806 30.92802588 30.48633262 29.66358052
## X1999 X2000 X2001 X2002 X2003 X2004
## 1 NA NA NA NA NA NA
## 2 0.04015697 0.0365737 0.03378536 0.04557366 0.05151838 0.04165539
## 3 1.10985966 0.9880774 0.94182891 0.89557767 0.92486944 0.93026295
## 4 0.95535931 1.0262131 1.05549588 1.23237878 1.33898498 1.40405869
## 5 7.92319165 7.9522863 7.72154906 7.56623988 7.24241557 7.34426233
## 6 28.88710798 27.0351591 29.43026994 28.50146173 27.96926982 27.03893822
## X2005 X2006 X2007 X2008 X2009 X2010
## 1 NA NA NA NA NA NA
## 2 0.06041878 0.06658329 0.06531235 0.1284166 0.1718624 0.2436140
## 3 0.81353929 0.82184008 0.81175351 0.8886580 0.9394040 0.9761842
## 4 1.33820940 1.33999574 1.39393137 1.3843112 1.4414936 1.5276237
## 5 7.35378001 6.79054277 6.53104692 6.4393039 6.1566875 6.1571978
## 6 25.38238104 22.93510429 21.37028576 22.0114692 19.8323489 19.0397698
## X2011 X2012 X2013 X2014 X2015 X2016 X2017
## 1 NA NA NA NA NA NA NA
## 2 0.2965062 0.2592953 0.1856237 0.1462356 0.1728967 0.1497893 0.1316946
## 3 0.9855223 0.9506959 1.0362939 1.0997791 1.1350441 1.0318113 0.8133007
## 4 1.6694232 1.5032405 1.5336300 1.6683374 1.6037751 1.5576644 1.7887861
## 5 5.8508861 5.9446542 5.9428004 5.8071277 6.0261818 6.0806003 6.1041339
## 6 18.5094574 19.2078011 20.0556476 20.0516980 21.0776420 21.4806686 20.7690223
## X2018 X2019 X2019.1
## 1 NA NA NA
## 2 0.1632953 0.1598244 0.1598244
## 3 0.7776749 0.7921371 0.7921371
## 4 1.7827389 1.6922483 1.6922483
## 5 6.3629754 6.4812174 6.4812174
## 6 18.3906781 19.3295633 19.3295633
##Merging dataset by using %>%
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
merge_data <- population_ds %>%
select(Country.Territory, Continent, X2022.Population) %>%
inner_join(
co2_data %>% select(Country.Name, co2_2019 = `X2019`),
by = c("Country.Territory" = "Country.Name")
)
head(merge_data)
## Country.Territory Continent X2022.Population co2_2019
## 1 Afghanistan Asia 41128771 0.1598244
## 2 Albania Europe 2842321 1.6922483
## 3 Algeria Africa 44903225 3.9776505
## 4 American Samoa Oceania 44273 NA
## 5 Andorra Europe 79824 6.4812174
## 6 Angola Africa 35588987 0.7921371
##Sum the 2019 CO2 emissions dataSets per continent by using Group By and marge dataset by using %>%
co2_cont <- merge_data %>% group_by(Continent) %>%
summarise(Total_CO2 = sum(co2_2019, na.rm = TRUE)) %>% arrange(desc(Total_CO2))
co2_cont
## # A tibble: 6 × 2
## Continent Total_CO2
## <chr> <dbl>
## 1 Asia 242.
## 2 Europe 229.
## 3 North America 79.6
## 4 Africa 58.0
## 5 Oceania 53.2
## 6 South America 29.4
##Counting top 5 country with high population
population5 <- population_ds %>% arrange(desc(X1990.Population)) %>% slice(1:5)
population5
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 1 1 CHN China Beijing Asia 1425887337
## 2 2 IND India New Delhi Asia 1417173173
## 3 3 USA United States Washington, D.C. North America 338289857
## 4 4 IDN Indonesia Jakarta Asia 275501339
## 5 7 BRA Brazil Brasilia South America 215313498
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 1424929781 1393715448 1348191368 1264099069
## 2 1396387127 1322866505 1240613620 1059633675
## 3 335942003 324607776 311182845 282398554
## 4 271857970 259091970 244016173 214072421
## 5 213196304 205188205 196353492 175873720
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 1153704252 982372466 822534450 9706961
## 2 870452165 696828385 557501301 3287590
## 3 248083732 223140018 200328340 9372610
## 4 182159874 148177096 115228394 1904569
## 5 150706446 122288383 96369875 8515767
## Density..per.km.. Growth.Rate World.Population.Percentage
## 1 146.8933 1.0000 17.88
## 2 431.0675 1.0068 17.77
## 3 36.0935 1.0038 4.24
## 4 144.6529 1.0064 3.45
## 5 25.2841 1.0046 2.70
##Using geom_bar() where it counts Rows by default
library(ggplot2)
ggplot(population5, aes(x = reorder(Country.Territory, X1990.Population), y = X1990.Population)) +
geom_bar(stat = "identity", fill = "darkblue") +
coord_flip() +
labs(
title = "Most Populous Countries in 1990 (TOP 5)",
x = "Country",
y = "Population"
) +
theme_minimal()
##Using geom_bar() where it counts Rows by default
ggplot(population5, aes(x = X1990.Population, y = Area..km..)) + geom_point(color = "darkblue", size = 3) + geom_smooth(method = "lm", color = "Orange", se = FALSE) +
labs(title = "Population vs Area", x = "Populatar in 1990", y = "Area (km²)") + theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
##Trace: is used to insert debugging code into a function temporarily
without editing the function itself
trace(mean)
mean(c(1, 2, 3, 4, 5))
## trace: mean(c(1, 2, 3, 4, 5))
## [1] 3
mean(mtcars$mpg)
## trace: mean(mtcars$mpg)
## [1] 20.09062
##Recover: execution at the point of error and lets you inspect the environment of each function in the call stack interactively
options(error = recover)
process_values <- function(values) {
results <- c()
for (i in seq_along(values)) {
result <- sqrt(values[[i]])
results <- c(results, result)
}
return(results)
}
my_values <- list(4, 9, -1, 16)
process_values(my_values)
## Warning in sqrt(values[[i]]): NaNs produced
## [1] 2 3 NaN 4
##Sapply/apply: applies remove_outliers() to each numeric column result is assigned back to numeric columns only with the Create Functions of Outliers
remove_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_value
upper <- Q3 + 1.5 * IQR_value
x[x < lower | x > upper] <- NA
return(x)
}
population_ds[sapply(population_ds, is.numeric)] <- lapply( population_ds[sapply(population_ds, is.numeric)], remove_outliers)
summary(population_ds)
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## Rank CCA3 Country.Territory Capital
## Min. : 1.00 Length :234 Length :234 Length :234
## 1st Qu.: 59.25 N.unique :234 N.unique :234 N.unique :234
## Median :117.50 N.blank : 0 N.blank : 0 N.blank : 0
## Mean :117.50 Min.nchar: 3 Min.nchar: 4 Min.nchar: 4
## 3rd Qu.:175.75 Max.nchar: 3 Max.nchar: 32 Max.nchar: 19
## Max. :234.00
##
## Continent X2022.Population X2020.Population X2015.Population
## Length :234 Min. : 510 Min. : 520 Min. : 564
## N.unique : 6 1st Qu.: 326101 1st Qu.: 304670 1st Qu.: 280558
## N.blank : 0 Median : 4030358 Median : 3931390 Median : 3771132
## Min.nchar: 4 Mean : 9985056 Mean : 9517226 Mean : 8630101
## Max.nchar: 13 3rd Qu.:12889576 3rd Qu.:12176349 3rd Qu.:11221301
## Max. :54179306 Max. :51985780 Max. :47119728
## NAs :25 NAs :26 NAs :27
## X2010.Population X2000.Population X1990.Population X1980.Population
## Min. : 596 Min. : 651 Min. : 700 Min. : 733
## 1st Qu.: 271390 1st Qu.: 221537 1st Qu.: 172725 1st Qu.: 146000
## Median : 3650902 Median : 3139954 Median : 2392030 Median : 1944578
## Mean : 8161724 Mean : 6401596 Mean : 5100338 Mean : 4091130
## 3rd Qu.:10660788 3rd Qu.: 8871043 3rd Qu.: 7682565 3rd Qu.: 6192514
## Max. :46572772 Max. :38504431 Max. :27657204 Max. :22125224
## NAs :26 NAs :29 NAs :31 NAs :32
## X1970.Population Area..km.. Density..per.km.. Growth.Rate
## Min. : 752 Min. : 1 Min. : 0.0261 Min. :0.9816
## 1st Qu.: 117891 1st Qu.: 1110 1st Qu.: 30.5126 1st Qu.:1.0018
## Median : 1741286 Median : 46949 Median : 82.0961 Median :1.0079
## Mean : 3581916 Mean : 166204 Mean :127.1829 Mean :1.0096
## 3rd Qu.: 5202918 3rd Qu.: 242563 3rd Qu.:176.3380 3rd Qu.:1.0165
## Max. :21434577 Max. :1030700 Max. :525.2334 Max. :1.0378
## NAs :29 NAs :28 NAs :22 NAs :3
## World.Population.Percentage
## Min. :0.000
## 1st Qu.:0.000
## Median :0.050
## Mean :0.125
## 3rd Qu.:0.160
## Max. :0.680
## NAs :25
##Lapply: Most reliable and always preserve structure for data Frames
num <- sapply(population_ds, is.numeric)
population_ds[num] <- lapply( population_ds[num], remove_outliers)
num
## Rank CCA3
## TRUE FALSE
## Country.Territory Capital
## FALSE FALSE
## Continent X2022.Population
## FALSE TRUE
## X2020.Population X2015.Population
## TRUE TRUE
## X2010.Population X2000.Population
## TRUE TRUE
## X1990.Population X1980.Population
## TRUE TRUE
## X1970.Population Area..km..
## TRUE TRUE
## Density..per.km.. Growth.Rate
## TRUE TRUE
## World.Population.Percentage
## TRUE
##Vapply: Vapply is more useful for element-wise operations
num <- vapply(population_ds, is.numeric, FUN.VALUE = logical(1))
population_ds[num] <- lapply(population_ds[num], remove_outliers)
num
## Rank CCA3
## TRUE FALSE
## Country.Territory Capital
## FALSE FALSE
## Continent X2022.Population
## FALSE TRUE
## X2020.Population X2015.Population
## TRUE TRUE
## X2010.Population X2000.Population
## TRUE TRUE
## X1990.Population X1980.Population
## TRUE TRUE
## X1970.Population Area..km..
## TRUE TRUE
## Density..per.km.. Growth.Rate
## TRUE TRUE
## World.Population.Percentage
## TRUE