The haven package reads and writes data files from SPSS (.sav), Stata (.dta), and SAS (.sas7bdat); it is part of the tidyverse ecosystem.

install.packages("haven")

## Create a sample dataset and view the data set

library(haven)
# Build a ten-row example data set: one person per row with age and salary
datasample <- data.frame(
  ID = seq_len(10),
  Name = c(
    "Jean", "Alice", "Bob", "David", "Emma",
    "Frank", "Grace", "Helen", "Ian", "Jane"
  ),
  Age = c(
    25, 30, 28, 35, 22,
    40, 27, 33, 29, 31
  ),
  Salary = c(
    500, 600, 550, 700, 450,
    800, 520, 680, 590, 610
  )
)
# Print every row so the whole data set can be inspected
print(datasample)
##    ID  Name Age Salary
## 1   1  Jean  25    500
## 2   2 Alice  30    600
## 3   3   Bob  28    550
## 4   4 David  35    700
## 5   5  Emma  22    450
## 6   6 Frank  40    800
## 7   7 Grace  27    520
## 8   8 Helen  33    680
## 9   9   Ian  29    590
## 10 10  Jane  31    610

Write the data to an SPSS file, then read it back and view it

# Save the sample data frame as an SPSS file in the current working directory
write_sav(datasample, "datasample.sav")
# List the working directory to confirm that the file was written
list.files()
## [1] "datasample.sav"                     "loan_datasample.dta"               
## [3] "R-Assignment.html"                  "R-Assignment.pdf"                  
## [5] "R-Assignment.Rmd"                   "R assignment I for Import Data.Rmd"
## [7] "R Assignment.Rmd"                   "rsconnect"
# A bare string only echoes itself; this line does not read or change the file
"datasample.sav"
## [1] "datasample.sav"
# Read the SPSS file back in; the preview below shows it comes back as a tibble
data_loaded <- read_sav("datasample.sav")

# Preview the first six rows of the re-imported data
head(data_loaded)
## # A tibble: 6 × 4
##      ID Name    Age Salary
##   <dbl> <chr> <dbl>  <dbl>
## 1     1 Jean     25    500
## 2     2 Alice    30    600
## 3     3 Bob      28    550
## 4     4 David    35    700
## 5     5 Emma     22    450
## 6     6 Frank    40    800
# Write the same data frame to an absolute location.
# NOTE(review): this path belongs to one Windows user profile, so the call only
# works on that machine — confirm or adjust before running elsewhere.
write_sav(datasample, "C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/datasample.sav")


# Read the file back from the same absolute location
spss_data <- read_sav("C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/datasample.sav")
# Preview the first six rows
head(spss_data)
## # A tibble: 6 × 4
##      ID Name    Age Salary
##   <dbl> <chr> <dbl>  <dbl>
## 1     1 Jean     25    500
## 2     2 Alice    30    600
## 3     3 Bob      28    550
## 4     4 David    35    700
## 5     5 Emma     22    450
## 6     6 Frank    40    800

Install the database packages (I used PostgreSQL)

install.packages("DBI")
install.packages("odbc")
install.packages("RMySQL")
install.packages("RPostgres")

1.6 Read an Excel file and install tinytex. tinytex is a lightweight, portable LaTeX distribution designed specifically for R users, maintained by Yihui Xie (creator of R Markdown).

install.packages("tinytex")
tinytex::install_tinytex()

# readxl provides read_excel() for .xlsx workbooks
library(readxl)
# Import the workbook from an absolute path.
# NOTE(review): the path is tied to one Windows user profile.
datainexcel <- read_excel("C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/OpeningDate.xlsx")
# The message below shows two columns with empty header names being
# auto-named `...5` and `...7`.
## New names:
## • `` -> `...5`
## • `` -> `...7`
# Preview the first six rows of the imported sheet
head(datainexcel)
## # A tibble: 6 × 12
##   `Open Date`         `Date Birth`        Gender Marital ...5  Status ...7 
##   <dttm>              <dttm>              <chr>  <chr>   <lgl> <chr>  <lgl>
## 1 2026-01-12 00:00:00 1999-09-09 00:00:00 Male   Single  NA    Active NA   
## 2 2026-02-03 00:00:00 1990-01-01 00:00:00 Male   Married NA    Active NA   
## 3 2026-02-03 00:00:00 1987-12-22 00:00:00 Female Married NA    Active NA   
## 4 2026-02-03 00:00:00 1990-12-28 00:00:00 Male   Single  NA    Active NA   
## 5 2026-02-03 00:00:00 1987-06-03 00:00:00 Female Married NA    Active NA   
## 6 2026-02-03 00:00:00 2000-01-01 00:00:00 Female Married NA    Active NA   
## # ℹ 5 more variables: Education <chr>, Post <chr>, Nature <chr>, Salary <dbl>,
## #   `Valid Date` <dttm>

Use the iris data to compute the mean petal length, overall and for each species

# Load the built-in iris data set into the workspace
data("iris")
# Preview the first six rows
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Mean petal length across all rows
mean(iris$Petal.Length)
## [1] 3.758
# Mean petal length for each species, selected with a logical filter on Species
mean(iris$Petal.Length[iris$Species=="setosa"])
## [1] 1.462
mean(iris$Petal.Length[iris$Species=="versicolor"])
## [1] 4.26
mean(iris$Petal.Length[iris$Species=="virginica"])
## [1] 5.552

Merging the data Sets

# Exploratory Data Analysis
# Load the world_population and CO2_emission CSV data sets (downloaded from the Kaggle website)

# Import the two CSV files into data frames.
# NOTE(review): both paths are absolute and tied to one Windows user profile —
# confirm or adjust before running on another machine.
population_data <- read.csv("C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/Assignment/world_population.csv")
co2_data <- read.csv("C:/Users/jeandedieu.rwabukang/Documents/R Assignment for Import/Assignment/CO2_emission.csv")

# View the first six rows of the population data
head(population_data)
##   Rank CCA3 Country.Territory          Capital Continent X2022.Population
## 1   36  AFG       Afghanistan            Kabul      Asia         41128771
## 2  138  ALB           Albania           Tirana    Europe          2842321
## 3   34  DZA           Algeria          Algiers    Africa         44903225
## 4  213  ASM    American Samoa        Pago Pago   Oceania            44273
## 5  203  AND           Andorra Andorra la Vella    Europe            79824
## 6   42  AGO            Angola           Luanda    Africa         35588987
##   X2020.Population X2015.Population X2010.Population X2000.Population
## 1         38972230         33753499         28189672         19542982
## 2          2866849          2882481          2913399          3182021
## 3         43451666         39543154         35856344         30774621
## 4            46189            51368            54849            58230
## 5            77700            71746            71519            66097
## 6         33428485         28127721         23364185         16394062
##   X1990.Population X1980.Population X1970.Population Area..km..
## 1         10694796         12486631         10752971     652230
## 2          3295066          2941651          2324731      28748
## 3         25518074         18739378         13795915    2381741
## 4            47818            32886            27075        199
## 5            53569            35611            19860        468
## 6         11828638          8330047          6029700    1246700
##   Density..per.km.. Growth.Rate World.Population.Percentage
## 1           63.0587      1.0257                        0.52
## 2           98.8702      0.9957                        0.04
## 3           18.8531      1.0164                        0.56
## 4          222.4774      0.9831                        0.00
## 5          170.5641      1.0100                        0.00
## 6           28.5466      1.0315                        0.45
# View the first six rows of the CO2 data (one row per country, one column per year)
head(co2_data)
##           Country.Name country_code                     Region
## 1                Aruba          ABW  Latin America & Caribbean
## 2          Afghanistan          AFG                 South Asia
## 3               Angola          AGO         Sub-Saharan Africa
## 4              Albania          ALB      Europe & Central Asia
## 5              Andorra          AND      Europe & Central Asia
## 6 United Arab Emirates          ARE Middle East & North Africa
##                           Indicator.Name      X1990      X1991       X1992
## 1 CO2 emissions (metric tons per capita)         NA         NA          NA
## 2 CO2 emissions (metric tons per capita)  0.1917451  0.1676816  0.09595774
## 3 CO2 emissions (metric tons per capita)  0.5536620  0.5445386  0.54355722
## 4 CO2 emissions (metric tons per capita)  1.8195416  1.2428102  0.68369983
## 5 CO2 emissions (metric tons per capita)  7.5218317  7.2353792  6.96307870
## 6 CO2 emissions (metric tons per capita) 30.1951886 31.7784962 29.08092584
##         X1993       X1994       X1995       X1996       X1997       X1998
## 1          NA          NA          NA          NA          NA          NA
## 2  0.08472111  0.07554583  0.06846796  0.06258803  0.05682662  0.05269086
## 3  0.70898423  0.83680440  0.91214149  1.07216847  1.08663697  1.09182531
## 4  0.63830704  0.64535519  0.60543625  0.61236736  0.46692147  0.57215370
## 5  6.72417752  6.54157891  6.73347949  6.99159455  7.30744115  7.63953851
## 6 29.27567777 30.84933296 31.12501806 30.92802588 30.48633262 29.66358052
##         X1999      X2000       X2001       X2002       X2003       X2004
## 1          NA         NA          NA          NA          NA          NA
## 2  0.04015697  0.0365737  0.03378536  0.04557366  0.05151838  0.04165539
## 3  1.10985966  0.9880774  0.94182891  0.89557767  0.92486944  0.93026295
## 4  0.95535931  1.0262131  1.05549588  1.23237878  1.33898498  1.40405869
## 5  7.92319165  7.9522863  7.72154906  7.56623988  7.24241557  7.34426233
## 6 28.88710798 27.0351591 29.43026994 28.50146173 27.96926982 27.03893822
##         X2005       X2006       X2007      X2008      X2009      X2010
## 1          NA          NA          NA         NA         NA         NA
## 2  0.06041878  0.06658329  0.06531235  0.1284166  0.1718624  0.2436140
## 3  0.81353929  0.82184008  0.81175351  0.8886580  0.9394040  0.9761842
## 4  1.33820940  1.33999574  1.39393137  1.3843112  1.4414936  1.5276237
## 5  7.35378001  6.79054277  6.53104692  6.4393039  6.1566875  6.1571978
## 6 25.38238104 22.93510429 21.37028576 22.0114692 19.8323489 19.0397698
##        X2011      X2012      X2013      X2014      X2015      X2016      X2017
## 1         NA         NA         NA         NA         NA         NA         NA
## 2  0.2965062  0.2592953  0.1856237  0.1462356  0.1728967  0.1497893  0.1316946
## 3  0.9855223  0.9506959  1.0362939  1.0997791  1.1350441  1.0318113  0.8133007
## 4  1.6694232  1.5032405  1.5336300  1.6683374  1.6037751  1.5576644  1.7887861
## 5  5.8508861  5.9446542  5.9428004  5.8071277  6.0261818  6.0806003  6.1041339
## 6 18.5094574 19.2078011 20.0556476 20.0516980 21.0776420 21.4806686 20.7690223
##        X2018      X2019    X2019.1
## 1         NA         NA         NA
## 2  0.1632953  0.1598244  0.1598244
## 3  0.7776749  0.7921371  0.7921371
## 4  1.7827389  1.6922483  1.6922483
## 5  6.3629754  6.4812174  6.4812174
## 6 18.3906781 19.3295633 19.3295633
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Merge the data sets using %>% and inner_join(), matching rows on the country name

# Keep country, continent and 2022 population from the population table, then
# attach each country's 2019 CO2 value. inner_join() keeps only the countries
# whose name appears in both tables.
merged_data2 <- population_data %>%
  select(Country.Territory, Continent, X2022.Population) %>%
  inner_join(
    select(co2_data, Country.Name, co2_2019 = X2019),
    by = c("Country.Territory" = "Country.Name")
  )
# Preview the first six joined rows
head(merged_data2)
##   Country.Territory Continent X2022.Population  co2_2019
## 1       Afghanistan      Asia         41128771 0.1598244
## 2           Albania    Europe          2842321 1.6922483
## 3           Algeria    Africa         44903225 3.9776505
## 4    American Samoa   Oceania            44273        NA
## 5           Andorra    Europe            79824 6.4812174
## 6            Angola    Africa         35588987 0.7921371

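# Sum the 2019 CO2 emission values by continent using group_by() and %>% (the values are metric tons per capita, so the result is a sum of per-country per-capita figures, not total emissions)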

# Aggregate the joined 2019 CO2 values per continent, largest total first.
# NOTE(review): the source indicator printed earlier is "CO2 emissions (metric
# tons per capita)", so Total_CO2 adds up per-capita figures rather than
# absolute emissions — confirm this is the intended measure.
continent_co2 <- merged_data2 %>%
  group_by(Continent) %>%
  summarise(
    # na.rm = TRUE skips countries whose 2019 value is missing
    Total_CO2 = sum(co2_2019, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_CO2))


# Print the resulting per-continent table
continent_co2
## # A tibble: 6 × 2
##   Continent     Total_CO2
##   <chr>             <dbl>
## 1 Asia              242. 
## 2 Europe            229. 
## 3 North America      79.6
## 4 Africa             58.0
## 5 Oceania            53.2
## 6 South America      29.4

## Using ggplot2

ggplot2 is one of the most popular data visualization packages, built on Leland Wilkinson's Grammar of Graphics.

install.packages("ggplot2")

Load the ggplot2 library

library(ggplot2)

# Using geom_bar(): it counts rows by default, so stat = "identity" is used below to plot the population values themselves

library(ggplot2)
# Sort countries from largest to smallest 2022 population and keep the
# first ten rows of that ordering.
top10_Population <- population_data %>%
  arrange(desc(X2022.Population)) %>%
  slice_head(n = 10)
# Print the ten selected rows
top10_Population
##    Rank CCA3 Country.Territory          Capital     Continent X2022.Population
## 1     1  CHN             China          Beijing          Asia       1425887337
## 2     2  IND             India        New Delhi          Asia       1417173173
## 3     3  USA     United States Washington, D.C. North America        338289857
## 4     4  IDN         Indonesia          Jakarta          Asia        275501339
## 5     5  PAK          Pakistan        Islamabad          Asia        235824862
## 6     6  NGA           Nigeria            Abuja        Africa        218541212
## 7     7  BRA            Brazil         Brasilia South America        215313498
## 8     8  BGD        Bangladesh            Dhaka          Asia        171186372
## 9     9  RUS            Russia           Moscow        Europe        144713314
## 10   10  MEX            Mexico      Mexico City North America        127504125
##    X2020.Population X2015.Population X2010.Population X2000.Population
## 1        1424929781       1393715448       1348191368       1264099069
## 2        1396387127       1322866505       1240613620       1059633675
## 3         335942003        324607776        311182845        282398554
## 4         271857970        259091970        244016173        214072421
## 5         227196741        210969298        194454498        154369924
## 6         208327405        183995785        160952853        122851984
## 7         213196304        205188205        196353492        175873720
## 8         167420951        157830000        148391139        129193327
## 9         145617329        144668389        143242599        146844839
## 10        125998302        120149897        112532401         97873442
##    X1990.Population X1980.Population X1970.Population Area..km..
## 1        1153704252        982372466        822534450    9706961
## 2         870452165        696828385        557501301    3287590
## 3         248083732        223140018        200328340    9372610
## 4         182159874        148177096        115228394    1904569
## 5         115414069         80624057         59290872     881912
## 6          95214257         72951439         55569264     923768
## 7         150706446        122288383         96369875    8515767
## 8         107147651         83929765         67541860     147570
## 9         148005704        138257420        130093010   17098242
## 10         81720428         67705186         50289306    1964375
##    Density..per.km.. Growth.Rate World.Population.Percentage
## 1           146.8933      1.0000                       17.88
## 2           431.0675      1.0068                       17.77
## 3            36.0935      1.0038                        4.24
## 4           144.6529      1.0064                        3.45
## 5           267.4018      1.0191                        2.96
## 6           236.5759      1.0241                        2.74
## 7            25.2841      1.0046                        2.70
## 8          1160.0350      1.0108                        2.15
## 9             8.4636      0.9973                        1.81
## 10           64.9082      1.0063                        1.60
# Horizontal bar chart of the ten most populous countries; reorder() sorts the
# bars by population so the largest ends up at the top after the flip.
ggplot(
  data = top10_Population,
  mapping = aes(
    x = reorder(Country.Territory, X2022.Population),
    y = X2022.Population
  )
) +
  # stat = "identity" draws each bar at its population value instead of a row count
  geom_bar(stat = "identity", fill = "steelblue") +
  # Swap the axes so the country names read horizontally
  coord_flip() +
  labs(title = "Top 10 Most Populous Countries in 2022", x = "Country", y = "Population") +
  theme_minimal()

# Using geom_smooth() with different colors: it adds a smoothed trend line to a plot and is almost always paired with geom_point(). Here the graph plots population against area as points, with a fitted trend line drawn through them.

# Scatter plot of area against 2022 population for the ten largest countries,
# with a straight-line fit drawn over the points.
ggplot(
  data = top10_Population,
  mapping = aes(x = X2022.Population, y = Area..km..)
) +
  geom_point(color = "blue", size = 3) +
  # method = "lm" fits a linear model; se = FALSE hides the confidence band
  geom_smooth(method = "lm", color = "Orange", se = FALSE) +
  labs(title = "Population vs Area", x = "Population 2022", y = "Area (km²)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

## 7. How to use trace() and recover()

trace() and recover() in R

trace() is used to insert debugging code into a function temporarily without editing the function itself. It lets you monitor when a function is called, what arguments it receives, and what happens inside it.

Basic syntax for trace():

# add tracing to a function
# trace(what = function_name, tracer = code_to_run)

trace()

# trace mean() to see every time it is called.
# Tracing stays installed on mean() until untrace(mean) is called, so later
# calls to mean() anywhere in the session are reported as well.
trace(mean)

# now call mean: each call is announced with a "trace:" line before its result
mean(c(1, 2, 3, 4, 5))
## trace: mean(c(1, 2, 3, 4, 5))
## [1] 3
mean(mtcars$mpg)
## trace: mean(mtcars$mpg)
## [1] 20.09062

recover() — interactive debugging

recover() pauses execution at the point of error and lets you inspect the environment of each function in the call stack interactively.

# Install recover() as the error handler; the option stays set for the rest of
# the session unless it is reset (e.g. options(error = NULL)).
options(error = recover)

#' Take the square root of every element of a list or vector
#'
#' @param values A list (or atomic vector) whose elements are numeric.
#' @return The square roots, in input order, combined into one numeric vector;
#'   `NULL` when `values` is empty. Negative inputs yield `NaN` with a warning
#'   from `sqrt()`.
process_values <- function(values) {
  # Preallocate one slot per input instead of growing the result with c() on
  # every iteration, which copies the whole vector each time.
  results <- vector("list", length(values))
  for (i in seq_along(values)) {
    results[[i]] <- sqrt(values[[i]])
  }
  # Flatten to a plain vector; unlist() of an empty list is NULL
  unlist(results)
}

# list with a bad value (-1 has no real square root)
my_values <- list(4, 9, -1, 16)

# As the output below shows, sqrt(-1) returns NaN with a warning rather than
# raising an error, so the recover() error handler is not triggered here.
process_values(my_values)
## Warning in sqrt(values[[i]]): NaNs produced
## [1]   2   3 NaN   4

# The functions sapply(), lapply(), vapply(), and mapply() are all apply-family functions in R. They replace loops and are used to apply a function over data structures like vectors, lists, and data frames.

Use sapply(): sapply(population_data, is.numeric) returns TRUE/FALSE for each column, and indexing population_data with that result selects only the numeric columns.

# lapply() applies remove_outliers() to each numeric column, and the result is assigned back to the numeric columns only. First, create the remove_outliers() function:

#' Replace IQR outliers in a vector with NA
#'
#' Values below `Q1 - k * IQR` or above `Q3 + k * IQR` are set to `NA`, where
#' `Q1` and `Q3` are the 25% and 75% quantiles of `x` and `IQR = Q3 - Q1`.
#'
#' @param x A numeric vector. Missing values are ignored when the quartiles
#'   are computed and are left as they are in the result.
#' @param k Non-negative fence multiplier. The default of 1.5 reproduces the
#'   conventional box-plot rule.
#' @return `x` with every value outside the fences replaced by `NA`; length
#'   and type are unchanged.
remove_outliers <- function(x, k = 1.5) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 0)

  # One quantile() call yields both quartiles
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence <- k * (quartiles[2] - quartiles[1])
  is_outlier <- x < quartiles[1] - fence | x > quartiles[2] + fence

  # which() drops NA comparisons, so existing missing values stay untouched
  x[which(is_outlier)] <- NA
  x
}

# Flag the numeric columns with sapply(), then overwrite just those columns
# with their outlier-filtered versions produced by lapply().
is_numeric_col <- sapply(population_data, is.numeric)
population_data[is_numeric_col] <- lapply(population_data[is_numeric_col], remove_outliers)

# mean() is still traced from the trace() demo earlier in the script, and
# summary() calls mean() once per numeric column, which floods the output with
# "trace: mean" lines. Remove the tracing before summarising.
untrace(mean)
# Summarise every column; the NAs rows presumably reflect the values that
# remove_outliers() replaced with NA.
summary(population_data)
##       Rank               CCA3     Country.Territory      Capital   
##  Min.   :  1.00   Length   :234   Length   :234     Length   :234  
##  1st Qu.: 59.25   N.unique :234   N.unique :234     N.unique :234  
##  Median :117.50   N.blank  :  0   N.blank  :  0     N.blank  :  0  
##  Mean   :117.50   Min.nchar:  3   Min.nchar:  4     Min.nchar:  4  
##  3rd Qu.:175.75   Max.nchar:  3   Max.nchar: 32     Max.nchar: 19  
##  Max.   :234.00                                                    
##                                                                    
##      Continent   X2022.Population   X2020.Population   X2015.Population  
##  Length   :234   Min.   :     510   Min.   :     520   Min.   :     564  
##  N.unique :  6   1st Qu.:  326101   1st Qu.:  304670   1st Qu.:  280558  
##  N.blank  :  0   Median : 4030358   Median : 3931390   Median : 3771132  
##  Min.nchar:  4   Mean   : 9985056   Mean   : 9517226   Mean   : 8630101  
##  Max.nchar: 13   3rd Qu.:12889576   3rd Qu.:12176349   3rd Qu.:11221301  
##                  Max.   :54179306   Max.   :51985780   Max.   :47119728  
##                  NAs    :25         NAs    :26         NAs    :27        
##  X2010.Population   X2000.Population   X1990.Population   X1980.Population  
##  Min.   :     596   Min.   :     651   Min.   :     700   Min.   :     733  
##  1st Qu.:  271390   1st Qu.:  221537   1st Qu.:  172725   1st Qu.:  146000  
##  Median : 3650902   Median : 3139954   Median : 2392030   Median : 1944578  
##  Mean   : 8161724   Mean   : 6401596   Mean   : 5100338   Mean   : 4091130  
##  3rd Qu.:10660788   3rd Qu.: 8871043   3rd Qu.: 7682565   3rd Qu.: 6192514  
##  Max.   :46572772   Max.   :38504431   Max.   :27657204   Max.   :22125224  
##  NAs    :26         NAs    :29         NAs    :31         NAs    :32        
##  X1970.Population     Area..km..      Density..per.km..   Growth.Rate    
##  Min.   :     752   Min.   :      1   Min.   :  0.0261   Min.   :0.9816  
##  1st Qu.:  117891   1st Qu.:   1110   1st Qu.: 30.5126   1st Qu.:1.0018  
##  Median : 1741286   Median :  46949   Median : 82.0961   Median :1.0079  
##  Mean   : 3581916   Mean   : 166204   Mean   :127.1829   Mean   :1.0096  
##  3rd Qu.: 5202918   3rd Qu.: 242563   3rd Qu.:176.3380   3rd Qu.:1.0165  
##  Max.   :21434577   Max.   :1030700   Max.   :525.2334   Max.   :1.0378  
##  NAs    :29         NAs    :28        NAs    :22         NAs    :3       
##  World.Population.Percentage
##  Min.   :0.000              
##  1st Qu.:0.000              
##  Median :0.050              
##  Mean   :0.125              
##  3rd Qu.:0.160              
##  Max.   :0.680              
##  NAs    :25

lapply() always returns a list, which preserves the column structure and makes it the most reliable choice for data frames

# Logical mask marking which columns are numeric
num_cols <- sapply(population_data, is.numeric)

# Work on a copy: remove_outliers() recomputes its IQR fences from whatever
# data it receives, so running it again on population_data itself would blank
# out additional values on every pass.
population_lapply <- population_data
population_lapply[num_cols] <- lapply(
  population_lapply[num_cols],
  remove_outliers
)
# Show which columns were treated as numeric
num_cols
##                        Rank                        CCA3 
##                        TRUE                       FALSE 
##           Country.Territory                     Capital 
##                       FALSE                       FALSE 
##                   Continent            X2022.Population 
##                       FALSE                        TRUE 
##            X2020.Population            X2015.Population 
##                        TRUE                        TRUE 
##            X2010.Population            X2000.Population 
##                        TRUE                        TRUE 
##            X1990.Population            X1980.Population 
##                        TRUE                        TRUE 
##            X1970.Population                  Area..km.. 
##                        TRUE                        TRUE 
##           Density..per.km..                 Growth.Rate 
##                        TRUE                        TRUE 
## World.Population.Percentage 
##                        TRUE

vapply() is most useful for element-wise operations where the output shape is fixed; it is not commonly used for column-wise transformations

# vapply() builds the numeric-column mask with a guaranteed result type:
# exactly one logical value per column.
num_cols <- vapply(population_data, is.numeric, FUN.VALUE = logical(1))
# Work on a copy: remove_outliers() recomputes its IQR fences from whatever
# data it receives, so running it again on population_data itself would blank
# out additional values on every pass.
population_vapply <- population_data
population_vapply[num_cols] <- lapply(
  population_vapply[num_cols],
  remove_outliers
)