Assignment

install.packages(“haven”) library(haven) install.packages(“DBI”) install.packages(“odbc”) install.packages(“RMySQL”) install.packages(“RMariaDB”) library(DBI) library(RMariaDB) install.packages(“tinytex”) tinytex::install_tinytex() install.packages(“writexl”) library(writexl) library(readxl) library(dplyr) library(ggplot2)

##dataset creation

employeedata <- data.frame(
  ID = 1:10,
  Name = c("Anna", "Ritha", "Oliver", "Jackson", "Jado", "Peter", "Keke", "Danny", "Alex", "Claude"),
  Age = c(55, 30, 89, 20, 44, 26, 78, 64, 70, 50),
  Salary = c(200, 500, 650, 800, 550, 400, 200, 700, 900, 300)
)
# Dataset display
print(employeedata)

##    ID    Name Age Salary
## 1   1    Anna  55    200
## 2   2   Ritha  30    500
## 3   3  Oliver  89    650
## 4   4 Jackson  20    800
## 5   5    Jado  44    550
## 6   6   Peter  26    400
## 7   7    Keke  78    200
## 8   8   Danny  64    700
## 9   9    Alex  70    900
## 10 10  Claude  50    300

##dataset reading

library(haven)
write_sav(employeedata, "employeedata.sav")
list.files()

## [1] "Assignment.Rmd"       "Assignment.Rproj"     "CO2_emission.csv"    
## [4] "employeedata.sav"     "loan_datasample.dta"  "loan_dataset.dta"    
## [7] "loan_dataset.xlsx"    "world_population.csv"

read_data <- read_sav("employeedata.sav")
head(read_data)

## # A tibble: 6 × 4
##      ID Name      Age Salary
##   <dbl> <chr>   <dbl>  <dbl>
## 1     1 Anna       55    200
## 2     2 Ritha      30    500
## 3     3 Oliver     89    650
## 4     4 Jackson    20    800
## 5     5 Jado       44    550
## 6     6 Peter      26    400

write_sav(employeedata, "C:/Users/hp/Documents/Assignment/employeedata.sav")

spss_data <- read_sav("C:/Users/hp/Documents/Assignment/employeedata.sav")
head(spss_data)

## # A tibble: 6 × 4
##      ID Name      Age Salary
##   <dbl> <chr>   <dbl>  <dbl>
## 1     1 Anna       55    200
## 2     2 Ritha      30    500
## 3     3 Oliver     89    650
## 4     4 Jackson    20    800
## 5     5 Jado       44    550
## 6     6 Peter      26    400

DTA files creation

set.seed(123)
n <- 100
loan_dataset <- data.frame(
  LoanID = 1001:(1000 + n),
  ClientID = sample(2001:3000, n),
  ClientName = paste("Client", 1:n),
  Gender = sample(c("Male", "Female"), n, replace = TRUE),
  Age = sample(21:65, n, replace = TRUE),
  LoanAmount = round(runif(n, 500000, 20000000), 0),
  InterestRate = round(runif(n, 10, 20), 2),
  LoanTermMonths = sample(c(12, 24, 36, 48, 60), n, replace = TRUE),
  LoanType = sample(c("Personal", "Mortgage", "Business", "Agriculture"), n, replace = TRUE),
  DisbursementDate = sample(seq(as.Date("2023-01-01"), as.Date("2025-01-01"), by="day"), n, replace = TRUE),
  Status = sample(c("Active", "Closed", "Defaulted", "Pending"), n, replace = TRUE),
  CollateralValue = round(runif(n, 0, 30000000), 0)
)
loan_dataset$MonthlyInstallment <- round(
  (loan_dataset$LoanAmount * (1 + loan_dataset$InterestRate/100)) / loan_dataset$LoanTermMonths,
  0
)
head(loan_dataset)

##   LoanID ClientID ClientName Gender Age LoanAmount InterestRate LoanTermMonths
## 1   1001     2415   Client 1 Female  23   10543900        12.41             48
## 2   1002     2463   Client 2 Female  49    3679871        10.58             48
## 3   1003     2179   Client 3   Male  56   12627095        12.37             36
## 4   1004     2526   Client 4   Male  64   19726106        18.90             48
## 5   1005     2195   Client 5 Female  42   13541045        18.12             24
## 6   1006     2938   Client 6 Female  62    8668860        17.48             36
##      LoanType DisbursementDate    Status CollateralValue MonthlyInstallment
## 1 Agriculture       2023-02-15    Closed        22780071             246925
## 2    Personal       2023-03-05    Active         8156706              84775
## 3 Agriculture       2024-06-14 Defaulted         3467474             394141
## 4    Business       2023-08-17   Pending          880098             488632
## 5    Business       2024-09-01    Closed        24205659             666445
## 6    Business       2023-05-09    Closed        14403793             282894

write_dta(loan_dataset, "loan_dataset.dta")
list.files()

## [1] "Assignment.Rmd"       "Assignment.Rproj"     "CO2_emission.csv"    
## [4] "employeedata.sav"     "loan_datasample.dta"  "loan_dataset.dta"    
## [7] "loan_dataset.xlsx"    "world_population.csv"

stata_data <- read_dta("loan_dataset.dta")
head(stata_data)

## # A tibble: 6 × 13
##   LoanID ClientID ClientName Gender   Age LoanAmount InterestRate LoanTermMonths
##    <dbl>    <dbl> <chr>      <chr>  <dbl>      <dbl>        <dbl>          <dbl>
## 1   1001     2415 Client 1   Female    23   10543900         12.4             48
## 2   1002     2463 Client 2   Female    49    3679871         10.6             48
## 3   1003     2179 Client 3   Male      56   12627095         12.4             36
## 4   1004     2526 Client 4   Male      64   19726106         18.9             48
## 5   1005     2195 Client 5   Female    42   13541045         18.1             24
## 6   1006     2938 Client 6   Female    62    8668860         17.5             36
## # ℹ 5 more variables: LoanType <chr>, DisbursementDate <date>, Status <chr>,
## #   CollateralValue <dbl>, MonthlyInstallment <dbl>

write_dta(loan_dataset, "C:/Users/hp/Documents/Assignment/loan_datasample.dta")

##reading dataset from database (mysql)

library(DBI)
library(RMariaDB)

con_my <- dbConnect(
  RMariaDB::MariaDB(),
  dbname = "employee_loan",
  host = "localhost",
  user = "root",
  password = "root",
  port = 3306
)

db_data <- dbGetQuery(con_my, "SELECT * FROM loans")
head(db_data)

##   LoanID ClientID ClientName Gender Age LoanAmount InterestRate LoanTermMonths
## 1   1001     2415   Client 1 Female  23   10543900        12.41             48
## 2   1002     2463   Client 2 Female  49    3679871        10.58             48
## 3   1003     2179   Client 3   Male  56   12627095        12.37             36
## 4   1004     2526   Client 4   Male  64   19726106        18.90             48
## 5   1005     2195   Client 5 Female  42   13541045        18.12             24
## 6   1006     2938   Client 6 Female  62    8668860        17.48             36
##      LoanType
## 1 Agriculture
## 2    Personal
## 3 Agriculture
## 4    Business
## 5    Business
## 6    Business

dbDisconnect(con_my)

##Reading dataset from Excel

library(readxl)
read_data <- read_excel("C:/Users/hp/Documents/Assignment/loan_dataset.xlsx")
head(read_data)

## # A tibble: 6 × 9
##   LoanID ClientID ClientName Gender   Age LoanAmount InterestRate LoanTermMonths
##    <dbl>    <dbl> <chr>      <chr>  <dbl>      <dbl>        <dbl>          <dbl>
## 1   1001     2415 Client 1   Female    23   10543900         12.4             48
## 2   1002     2463 Client 2   Female    49    3679871         10.6             48
## 3   1003     2179 Client 3   Male      56   12627095         12.4             36
## 4   1004     2526 Client 4   Male      64   19726106         18.9             48
## 5   1005     2195 Client 5   Female    42   13541045         18.1             24
## 6   1006     2938 Client 6   Female    62    8668860         17.5             36
## # ℹ 1 more variable: LoanType <chr>

##Calculate Mean of Iris dataset

data("iris")
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

mean(iris$Petal.Length)

## [1] 3.758

mean(iris$Petal.Length[iris$Species=="setosa"])

## [1] 1.462

mean(iris$Petal.Length[iris$Species=="versicolor"])

## [1] 4.26

mean(iris$Petal.Length[iris$Species=="virginica"])

## [1] 5.552

##Dataset exploratory

population_ds <- read.csv("C:/Users/hp/Documents/Assignment/world_population.csv")
co2_data <- read.csv("C:/Users/hp/Documents/Assignment/CO2_emission.csv")
head(population_ds)

##   Rank CCA3 Country.Territory          Capital Continent X2022.Population
## 1   36  AFG       Afghanistan            Kabul      Asia         41128771
## 2  138  ALB           Albania           Tirana    Europe          2842321
## 3   34  DZA           Algeria          Algiers    Africa         44903225
## 4  213  ASM    American Samoa        Pago Pago   Oceania            44273
## 5  203  AND           Andorra Andorra la Vella    Europe            79824
## 6   42  AGO            Angola           Luanda    Africa         35588987
##   X2020.Population X2015.Population X2010.Population X2000.Population
## 1         38972230         33753499         28189672         19542982
## 2          2866849          2882481          2913399          3182021
## 3         43451666         39543154         35856344         30774621
## 4            46189            51368            54849            58230
## 5            77700            71746            71519            66097
## 6         33428485         28127721         23364185         16394062
##   X1990.Population X1980.Population X1970.Population Area..km..
## 1         10694796         12486631         10752971     652230
## 2          3295066          2941651          2324731      28748
## 3         25518074         18739378         13795915    2381741
## 4            47818            32886            27075        199
## 5            53569            35611            19860        468
## 6         11828638          8330047          6029700    1246700
##   Density..per.km.. Growth.Rate World.Population.Percentage
## 1           63.0587      1.0257                        0.52
## 2           98.8702      0.9957                        0.04
## 3           18.8531      1.0164                        0.56
## 4          222.4774      0.9831                        0.00
## 5          170.5641      1.0100                        0.00
## 6           28.5466      1.0315                        0.45

##Dataset exploratory

head(co2_data)

##           Country.Name country_code                     Region
## 1                Aruba          ABW  Latin America & Caribbean
## 2          Afghanistan          AFG                 South Asia
## 3               Angola          AGO         Sub-Saharan Africa
## 4              Albania          ALB      Europe & Central Asia
## 5              Andorra          AND      Europe & Central Asia
## 6 United Arab Emirates          ARE Middle East & North Africa
##                           Indicator.Name      X1990      X1991       X1992
## 1 CO2 emissions (metric tons per capita)         NA         NA          NA
## 2 CO2 emissions (metric tons per capita)  0.1917451  0.1676816  0.09595774
## 3 CO2 emissions (metric tons per capita)  0.5536620  0.5445386  0.54355722
## 4 CO2 emissions (metric tons per capita)  1.8195416  1.2428102  0.68369983
## 5 CO2 emissions (metric tons per capita)  7.5218317  7.2353792  6.96307870
## 6 CO2 emissions (metric tons per capita) 30.1951886 31.7784962 29.08092584
##         X1993       X1994       X1995       X1996       X1997       X1998
## 1          NA          NA          NA          NA          NA          NA
## 2  0.08472111  0.07554583  0.06846796  0.06258803  0.05682662  0.05269086
## 3  0.70898423  0.83680440  0.91214149  1.07216847  1.08663697  1.09182531
## 4  0.63830704  0.64535519  0.60543625  0.61236736  0.46692147  0.57215370
## 5  6.72417752  6.54157891  6.73347949  6.99159455  7.30744115  7.63953851
## 6 29.27567777 30.84933296 31.12501806 30.92802588 30.48633262 29.66358052
##         X1999      X2000       X2001       X2002       X2003       X2004
## 1          NA         NA          NA          NA          NA          NA
## 2  0.04015697  0.0365737  0.03378536  0.04557366  0.05151838  0.04165539
## 3  1.10985966  0.9880774  0.94182891  0.89557767  0.92486944  0.93026295
## 4  0.95535931  1.0262131  1.05549588  1.23237878  1.33898498  1.40405869
## 5  7.92319165  7.9522863  7.72154906  7.56623988  7.24241557  7.34426233
## 6 28.88710798 27.0351591 29.43026994 28.50146173 27.96926982 27.03893822
##         X2005       X2006       X2007      X2008      X2009      X2010
## 1          NA          NA          NA         NA         NA         NA
## 2  0.06041878  0.06658329  0.06531235  0.1284166  0.1718624  0.2436140
## 3  0.81353929  0.82184008  0.81175351  0.8886580  0.9394040  0.9761842
## 4  1.33820940  1.33999574  1.39393137  1.3843112  1.4414936  1.5276237
## 5  7.35378001  6.79054277  6.53104692  6.4393039  6.1566875  6.1571978
## 6 25.38238104 22.93510429 21.37028576 22.0114692 19.8323489 19.0397698
##        X2011      X2012      X2013      X2014      X2015      X2016      X2017
## 1         NA         NA         NA         NA         NA         NA         NA
## 2  0.2965062  0.2592953  0.1856237  0.1462356  0.1728967  0.1497893  0.1316946
## 3  0.9855223  0.9506959  1.0362939  1.0997791  1.1350441  1.0318113  0.8133007
## 4  1.6694232  1.5032405  1.5336300  1.6683374  1.6037751  1.5576644  1.7887861
## 5  5.8508861  5.9446542  5.9428004  5.8071277  6.0261818  6.0806003  6.1041339
## 6 18.5094574 19.2078011 20.0556476 20.0516980 21.0776420 21.4806686 20.7690223
##        X2018      X2019    X2019.1
## 1         NA         NA         NA
## 2  0.1632953  0.1598244  0.1598244
## 3  0.7776749  0.7921371  0.7921371
## 4  1.7827389  1.6922483  1.6922483
## 5  6.3629754  6.4812174  6.4812174
## 6 18.3906781 19.3295633 19.3295633

##Merging dataset by using %>%

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

merge_data <- population_ds %>% 
  select(Country.Territory, Continent, X2022.Population) %>%
  inner_join(
    co2_data %>% select(Country.Name, co2_2019 = `X2019`), 
    by = c("Country.Territory" = "Country.Name")
  )
head(merge_data)

##   Country.Territory Continent X2022.Population  co2_2019
## 1       Afghanistan      Asia         41128771 0.1598244
## 2           Albania    Europe          2842321 1.6922483
## 3           Algeria    Africa         44903225 3.9776505
## 4    American Samoa   Oceania            44273        NA
## 5           Andorra    Europe            79824 6.4812174
## 6            Angola    Africa         35588987 0.7921371

##Sum the 2019 CO2 emissions dataSets per continent by using Group By and marge dataset by using %>%

co2_cont <- merge_data %>% group_by(Continent) %>% 
  summarise(Total_CO2 = sum(co2_2019, na.rm = TRUE)) %>% arrange(desc(Total_CO2))
co2_cont

## # A tibble: 6 × 2
##   Continent     Total_CO2
##   <chr>             <dbl>
## 1 Asia              242. 
## 2 Europe            229. 
## 3 North America      79.6
## 4 Africa             58.0
## 5 Oceania            53.2
## 6 South America      29.4

##Counting top 5 country with high population

population5 <- population_ds %>% arrange(desc(X1990.Population)) %>% slice(1:5)
population5

##   Rank CCA3 Country.Territory          Capital     Continent X2022.Population
## 1    1  CHN             China          Beijing          Asia       1425887337
## 2    2  IND             India        New Delhi          Asia       1417173173
## 3    3  USA     United States Washington, D.C. North America        338289857
## 4    4  IDN         Indonesia          Jakarta          Asia        275501339
## 5    7  BRA            Brazil         Brasilia South America        215313498
##   X2020.Population X2015.Population X2010.Population X2000.Population
## 1       1424929781       1393715448       1348191368       1264099069
## 2       1396387127       1322866505       1240613620       1059633675
## 3        335942003        324607776        311182845        282398554
## 4        271857970        259091970        244016173        214072421
## 5        213196304        205188205        196353492        175873720
##   X1990.Population X1980.Population X1970.Population Area..km..
## 1       1153704252        982372466        822534450    9706961
## 2        870452165        696828385        557501301    3287590
## 3        248083732        223140018        200328340    9372610
## 4        182159874        148177096        115228394    1904569
## 5        150706446        122288383         96369875    8515767
##   Density..per.km.. Growth.Rate World.Population.Percentage
## 1          146.8933      1.0000                       17.88
## 2          431.0675      1.0068                       17.77
## 3           36.0935      1.0038                        4.24
## 4          144.6529      1.0064                        3.45
## 5           25.2841      1.0046                        2.70

##Using geom_bar() where it counts Rows by default

library(ggplot2)
ggplot(population5, aes(x = reorder(Country.Territory, X1990.Population), y = X1990.Population)) +
  geom_bar(stat = "identity", fill = "darkblue") + 
  coord_flip() + 
  labs(
    title = "Most Populous Countries in 1990 (TOP 5)",
    x = "Country", 
    y = "Population"
  ) + 
  theme_minimal()

##Using geom_bar() where it counts Rows by default

ggplot(population5, aes(x = X1990.Population, y = Area..km..)) + geom_point(color = "darkblue", size = 3) + geom_smooth(method = "lm", color = "Orange", se = FALSE) +
  labs(title = "Population vs Area", x = "Populatar in 1990", y = "Area (km²)") + theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

##Trace: is used to insert debugging code into a function temporarily without editing the function itself

trace(mean)
mean(c(1, 2, 3, 4, 5))

## trace: mean(c(1, 2, 3, 4, 5))

## [1] 3

mean(mtcars$mpg)

## trace: mean(mtcars$mpg)

## [1] 20.09062

##Recover: execution at the point of error and lets you inspect the environment of each function in the call stack interactively

options(error = recover)
process_values <- function(values) {
  results <- c()
  for (i in seq_along(values)) {
    result    <- sqrt(values[[i]])
    results   <- c(results, result)
  }
  return(results)
}
my_values <- list(4, 9, -1, 16)
process_values(my_values)

## Warning in sqrt(values[[i]]): NaNs produced

## [1]   2   3 NaN   4

##Sapply/apply: applies remove_outliers() to each numeric column result is assigned back to numeric columns only with the Create Functions of Outliers

remove_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_value <- Q3 - Q1

  lower <- Q1 - 1.5 * IQR_value
  upper <- Q3 + 1.5 * IQR_value

  x[x < lower | x > upper] <- NA
  return(x)
}

population_ds[sapply(population_ds, is.numeric)] <- lapply( population_ds[sapply(population_ds, is.numeric)], remove_outliers)
summary(population_ds)

## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean
## trace: mean

##       Rank               CCA3     Country.Territory      Capital   
##  Min.   :  1.00   Length   :234   Length   :234     Length   :234  
##  1st Qu.: 59.25   N.unique :234   N.unique :234     N.unique :234  
##  Median :117.50   N.blank  :  0   N.blank  :  0     N.blank  :  0  
##  Mean   :117.50   Min.nchar:  3   Min.nchar:  4     Min.nchar:  4  
##  3rd Qu.:175.75   Max.nchar:  3   Max.nchar: 32     Max.nchar: 19  
##  Max.   :234.00                                                    
##                                                                    
##      Continent   X2022.Population   X2020.Population   X2015.Population  
##  Length   :234   Min.   :     510   Min.   :     520   Min.   :     564  
##  N.unique :  6   1st Qu.:  326101   1st Qu.:  304670   1st Qu.:  280558  
##  N.blank  :  0   Median : 4030358   Median : 3931390   Median : 3771132  
##  Min.nchar:  4   Mean   : 9985056   Mean   : 9517226   Mean   : 8630101  
##  Max.nchar: 13   3rd Qu.:12889576   3rd Qu.:12176349   3rd Qu.:11221301  
##                  Max.   :54179306   Max.   :51985780   Max.   :47119728  
##                  NAs    :25         NAs    :26         NAs    :27        
##  X2010.Population   X2000.Population   X1990.Population   X1980.Population  
##  Min.   :     596   Min.   :     651   Min.   :     700   Min.   :     733  
##  1st Qu.:  271390   1st Qu.:  221537   1st Qu.:  172725   1st Qu.:  146000  
##  Median : 3650902   Median : 3139954   Median : 2392030   Median : 1944578  
##  Mean   : 8161724   Mean   : 6401596   Mean   : 5100338   Mean   : 4091130  
##  3rd Qu.:10660788   3rd Qu.: 8871043   3rd Qu.: 7682565   3rd Qu.: 6192514  
##  Max.   :46572772   Max.   :38504431   Max.   :27657204   Max.   :22125224  
##  NAs    :26         NAs    :29         NAs    :31         NAs    :32        
##  X1970.Population     Area..km..      Density..per.km..   Growth.Rate    
##  Min.   :     752   Min.   :      1   Min.   :  0.0261   Min.   :0.9816  
##  1st Qu.:  117891   1st Qu.:   1110   1st Qu.: 30.5126   1st Qu.:1.0018  
##  Median : 1741286   Median :  46949   Median : 82.0961   Median :1.0079  
##  Mean   : 3581916   Mean   : 166204   Mean   :127.1829   Mean   :1.0096  
##  3rd Qu.: 5202918   3rd Qu.: 242563   3rd Qu.:176.3380   3rd Qu.:1.0165  
##  Max.   :21434577   Max.   :1030700   Max.   :525.2334   Max.   :1.0378  
##  NAs    :29         NAs    :28        NAs    :22         NAs    :3       
##  World.Population.Percentage
##  Min.   :0.000              
##  1st Qu.:0.000              
##  Median :0.050              
##  Mean   :0.125              
##  3rd Qu.:0.160              
##  Max.   :0.680              
##  NAs    :25

##Lapply: Most reliable and always preserve structure for data Frames

num <- sapply(population_ds, is.numeric)
population_ds[num] <- lapply( population_ds[num], remove_outliers)
num

##                        Rank                        CCA3 
##                        TRUE                       FALSE 
##           Country.Territory                     Capital 
##                       FALSE                       FALSE 
##                   Continent            X2022.Population 
##                       FALSE                        TRUE 
##            X2020.Population            X2015.Population 
##                        TRUE                        TRUE 
##            X2010.Population            X2000.Population 
##                        TRUE                        TRUE 
##            X1990.Population            X1980.Population 
##                        TRUE                        TRUE 
##            X1970.Population                  Area..km.. 
##                        TRUE                        TRUE 
##           Density..per.km..                 Growth.Rate 
##                        TRUE                        TRUE 
## World.Population.Percentage 
##                        TRUE

##Vapply: Vapply is more useful for element-wise operations

num <- vapply(population_ds, is.numeric, FUN.VALUE = logical(1))
population_ds[num] <- lapply(population_ds[num], remove_outliers)
num

##                        Rank                        CCA3 
##                        TRUE                       FALSE 
##           Country.Territory                     Capital 
##                       FALSE                       FALSE 
##                   Continent            X2022.Population 
##                       FALSE                        TRUE 
##            X2020.Population            X2015.Population 
##                        TRUE                        TRUE 
##            X2010.Population            X2000.Population 
##                        TRUE                        TRUE 
##            X1990.Population            X1980.Population 
##                        TRUE                        TRUE 
##            X1970.Population                  Area..km.. 
##                        TRUE                        TRUE 
##           Density..per.km..                 Growth.Rate 
##                        TRUE                        TRUE 
## World.Population.Percentage 
##                        TRUE

Assignment

Medard Gasarabwe

2026-05-24

DTA files creation