Assignment 1: Import from different data sources

Import from excel

library('readxl')
excel_data <- read_excel('students.xlsx')
head(excel_data)
## # A tibble: 4 × 4
##   Names   Algorithm  Math `R Programming`
##   <chr>       <dbl> <dbl>           <dbl>
## 1 Yves           16    17              18
## 2 Anitha         14    15              16
## 3 Mugisha        15    16              17
## 4 Uwase          17    16              15

To import from Excel we use the readxl package: we install it first if it is not already installed, load it with library(), and then call read_excel() with the path to the file.

Data statistical packages

To import data from statistical software (SAS, SPSS, Stata) we use the haven package.

Import from SAS

library('haven')
# Read the SAS data set (.sas7bdat) into a tibble with haven::read_sas().
# NOTE(review): the values printed below (~5e-315) are subnormal doubles and do
# not match the SPSS/Stata versions of what appears to be the same data set, so
# this file is probably not being decoded correctly - verify the .sas7bdat file.
data <- read_sas('accidents.sas7bdat')
# Preview the first rows.
head(data)
## # A tibble: 6 × 4
##      agecat    gender     accid       pop
##       <dbl>     <dbl>     <dbl>     <dbl>
## 1 5.30e-315 5.30e-315 5.38e-315 5.39e-315
## 2 5.30e-315 5.30e-315 5.38e-315 5.39e-315
## 3 5.31e-315 5.30e-315 5.38e-315 5.39e-315
## 4 5.30e-315 0         5.38e-315 5.39e-315
## 5 5.30e-315 0         5.38e-315 5.39e-315
## 6 5.31e-315 0         5.38e-315 5.39e-315

Import from SPSS

library('haven')
data <- read_sav('accidents.sav')
head(data)
## # A tibble: 6 × 4
##   agecat       gender     accid    pop
##   <dbl+lbl>    <dbl+lbl>  <dbl>  <dbl>
## 1 1 [Under 21] 1 [Female] 57997 198522
## 2 2 [21-25]    1 [Female] 57113 203200
## 3 3 [26-30]    1 [Female] 54123 200744
## 4 1 [Under 21] 0 [Male]   63936 187791
## 5 2 [21-25]    0 [Male]   64835 195714
## 6 3 [26-30]    0 [Male]   66804 208239

Import from Stata

library('haven')
data <- read_dta('accidents.dta')
head(data)
## # A tibble: 6 × 4
##   agecat       gender     accid    pop
##   <dbl+lbl>    <dbl+lbl>  <dbl>  <dbl>
## 1 1 [Under 21] 1 [Female] 57997 198522
## 2 2 [21-25]    1 [Female] 57113 203200
## 3 3 [26-30]    1 [Female] 54123 200744
## 4 1 [Under 21] 0 [Male]   63936 187791
## 5 2 [21-25]    0 [Male]   64835 195714
## 6 3 [26-30]    0 [Male]   66804 208239

Data from Text files

Import from ASCII

# Read the comma-separated text file; the first line holds the column names.
# NOTE(review): this is an absolute Windows path, unlike the relative paths used
# by the other imports - the snippet only runs where D:/R/students.txt exists.
data <- read.table('D:/R/students.txt', sep=',', header=TRUE)
# Preview the first rows.
head(data)
##     Names Algorithm Math R.Programming
## 1    Yves        16   17            18
## 2  Anitha        14   15            16
## 3 Mugisha        15   16            17
## 4   Uwase        17   16            15

Here we import data from a plain-text (.txt) file in which rows are separated by newlines and columns by commas.

Import from XML

library(xml2)
doc <- read_xml('students.xml')
xml_structure(doc)
## <students>
##   <student>
##     <name>
##       {text}
##     <algorithm>
##       {text}
##     <math>
##       {text}
##     <r_programming>
##       {text}
##   <student>
##     <name>
##       {text}
##     <algorithm>
##       {text}
##     <math>
##       {text}
##     <r_programming>
##       {text}
##   <student>
##     <name>
##       {text}
##     <algorithm>
##       {text}
##     <math>
##       {text}
##     <r_programming>
##       {text}
##   <student>
##     <name>
##       {text}
##     <algorithm>
##       {text}
##     <math>
##       {text}
##     <r_programming>
##       {text}
# Extract the text of every <name>, <math>, <r_programming> and <algorithm>
# node. xml_text() always returns character vectors, so the marks are
# converted to numeric to make them usable in calculations.
col_names <- xml_text(xml_find_all(doc, './/name'))
col_math <- as.numeric(xml_text(xml_find_all(doc, './/math')))
col_r <- as.numeric(xml_text(xml_find_all(doc, './/r_programming')))
col_algo <- as.numeric(xml_text(xml_find_all(doc, './/algorithm')))

# Assemble the columns into a data frame; column names mirror the XML node
# names.
data <- data.frame(
  name = col_names,
  math = col_math,
  r_programming = col_r,
  algorithm = col_algo
)
head(data)
##      name math r_programming algorithm
## 1    Yves   17            18        16
## 2  Anitha   15            16        14
## 3 Mugisha   16            17        15
## 4   Uwase   16            15        17

To import data from XML we use the xml2 package: we select nodes by name with XPath expressions and extract their text.

Import from webscraping

library(rvest)
# Download and parse the Igihe homepage.
page <- read_html('https://igihe.com/index.php')
# Headline containers are the elements carrying the "homenews-title" class.
news_data <- page %>% html_elements('.homenews-title')
# From the <a> tags inside each container take the visible text (title) ...
news_titles <- news_data %>%
  html_elements('a') %>%
  html_text()
# ... and the href attribute (link, as written in the page).
news_links <- news_data %>%
  html_elements('a') %>%
  html_attr('href')
data <- data.frame(title = news_titles, link = news_links)
head(data)
##                                                                                                    title
## 1                        Miliyoni 300 Frw zigiye kwifashishwa mu kubungabunga ibirwa bibiri by’i Rutsiro
## 2                                    Ikitakwishe kiragukomeza- Shakira ku itandukana rye na Gerard Piqué
## 3                                         Ubushakashatsi bwagaragaje inshuro umuntu agomba koga mu mutwe
## 4 Bigogwe: Abakozi ba Access to Finance Rwanda bibutse abishwe muri Jenoside,  igabira inka abayirokotse
## 5                                                  Kuki abantu bambara imyenda y’umukara mu gushyingura?
## 6                                             Netanyahu ahangayikishijwe n’abashaka kwica Perezida Trump
##                                                                                                      link
## 1 amakuru/u-rwanda/article/miliyoni-300frw-zigiye-kwifashishwa-mu-kubungabunga-ibirwa-bibiri-by-i-rutsiro
## 2                      imikino/article/ikitakwishe-kiragukomeza-shakira-ku-itandukana-rye-na-gerard-pique
## 3                          ubuzima/article/ubushakashatsi-bwagaragaje-inshuro-umuntu-agomba-koga-mu-mutwe
## 4         amakuru/u-rwanda/article/bigogwe-access-to-finance-rwanda-yibutse-abazize-jenoside-igabira-inka
## 5                    amakuru/utuntu-n-utundi/article/kuki-abantu-bambara-imyenda-y-umukara-mu-gushyingura
## 6                   amakuru/mu-mahanga/article/netanyahu-ahangayikishijwe-n-abashaka-kwica-perezida-trump

We use rvest for this: we read the page and then parse the HTML that comes back. Here I am getting some news titles and links from the Igihe homepage.

Data from database management systems

Import from MySQL

library(DBI)
library(RMariaDB)
# Open a connection to the local MySQL/MariaDB server, database "r".
# NOTE(review): connects as root without a password - acceptable for a local
# demo only.
con <- dbConnect(
  RMariaDB::MariaDB(),
  host = 'localhost',
  port = 3306,
  user = 'root',
  dbname = 'r'
)
# Option 1: read the whole table by name.
data <- dbReadTable(con, 'students')
head(data)
##    names math algorithm r_programming id
## 1   Yves   14        15            16  1
## 2 Anitha   16        17            18  2
# Option 2: read it through an SQL query.
data2 <- dbGetQuery(con, 'SELECT * FROM students')
head(data2)
##    names math algorithm r_programming id
## 1   Yves   14        15            16  1
## 2 Anitha   16        17            18  2
# Close the connection so it is not leaked once the data has been read.
dbDisconnect(con)

Import from Access

library(DBI)
library(odbc)
# Open an ODBC connection to the Access database file next to the script.
con <- dbConnect(
  odbc::odbc(),
  Driver = 'Microsoft Access Driver (*.mdb, *.accdb)',
  DBQ = './students.accdb'
)
# Option 1: read the whole table by name.
data <- dbReadTable(con, 'students')
head(data)
##   ID  names Algorithm Math R.Programming
## 1  1   Yves        15   16            17
## 2  2 Anitha        16   17            18
# Option 2: read it through an SQL query.
data2 <- dbGetQuery(con, 'SELECT * FROM students')
head(data2)
##   ID  names Algorithm Math R Programming
## 1  1   Yves        15   16            17
## 2  2 Anitha        16   17            18
# Close the connection so it is not leaked once the data has been read.
dbDisconnect(con)

Assignment 2: Merging multiple dataframes

world_population <- read.csv('world_population.csv')
co2_emission <- read.csv('CO2_emission.csv')

variable.names(world_population)
##  [1] "Rank"                        "CCA3"                       
##  [3] "Country.Territory"           "Capital"                    
##  [5] "Continent"                   "X2022.Population"           
##  [7] "X2020.Population"            "X2015.Population"           
##  [9] "X2010.Population"            "X2000.Population"           
## [11] "X1990.Population"            "X1980.Population"           
## [13] "X1970.Population"            "Area..km.."                 
## [15] "Density..per.km.."           "Growth.Rate"                
## [17] "World.Population.Percentage"
variable.names(co2_emission)
##  [1] "Country.Name"   "country_code"   "Region"         "Indicator.Name"
##  [5] "X1990"          "X1991"          "X1992"          "X1993"         
##  [9] "X1994"          "X1995"          "X1996"          "X1997"         
## [13] "X1998"          "X1999"          "X2000"          "X2001"         
## [17] "X2002"          "X2003"          "X2004"          "X2005"         
## [21] "X2006"          "X2007"          "X2008"          "X2009"         
## [25] "X2010"          "X2011"          "X2012"          "X2013"         
## [29] "X2014"          "X2015"          "X2016"          "X2017"         
## [33] "X2018"          "X2019"          "X2019.1"
# Join the 2010 population and the 2010 CO2 emission figures on country name.
# all = TRUE keeps countries that appear in only one of the two tables; their
# missing figure becomes NA.
# NOTE(review): names spelled differently in the two files will not match; the
# code columns (CCA3 / country_code) may be a more reliable key - confirm.
merged <- merge(
  world_population[c('Country.Territory', 'X2010.Population')],
  co2_emission[c('Country.Name', 'X2010')],
  by.x = 'Country.Territory',
  by.y = 'Country.Name',
  all = TRUE
)

head(merged)
##   Country.Territory X2010.Population     X2010
## 1       Afghanistan         28189672 0.2436140
## 2           Albania          2913399 1.5276237
## 3           Algeria         35856344 3.1736545
## 4    American Samoa            54849        NA
## 5           Andorra            71519 6.1571978
## 6            Angola         23364185 0.9761842

First I imported two datasets (World Population and CO2 Emission). After that I checked their columns, and then I decided to join the 2010 data from World Population and CO2 Emission.

Assignment 3: How to use GroupBy

data("iris")
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Mean sepal length, sepal width and petal length for each species.
iris %>%
  group_by(Species) %>%
  summarise(
    average_sepal_length = mean(Sepal.Length),
    average_sepal_width = mean(Sepal.Width),
    average_petal_length = mean(Petal.Length)
  )
## # A tibble: 3 × 4
##   Species    average_sepal_length average_sepal_width average_petal_length
##   <fct>                     <dbl>               <dbl>                <dbl>
## 1 setosa                     5.01                3.43                 1.46
## 2 versicolor                 5.94                2.77                 4.26
## 3 virginica                  6.59                2.97                 5.55

Here we take all the data and group it by species, then we make some calculations such as the average for each group, so we can see how, for example, petal length differs by species.

Assignment 4: Trace and Recover

Trace

trace() is used to know when a function is called, and we use untrace() when we no longer want to be notified that the function is called.

trace(mean, quote(print('We have traced the mean function')))
## Tracing function "mean" in package "base"
## [1] "mean"
mean(c(1,2,3))
## Tracing mean(c(1, 2, 3)) on entry 
## [1] "We have traced the mean function"
## [1] 2
untrace(mean)
## Untracing function "mean" in package "base"
mean(c(1,2,3))
## [1] 2

Recover

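recover() is used when an error occurs: it lets you walk through the active function calls to inspect what went wrong (for example, by setting options(error = recover)).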

Assignment 5:

sapply, lapply, vapply, mapply

sapply, lapply and vapply all take a list (or vector) as input, but their outputs differ: lapply always returns a list, sapply returns different things depending on what the function returns, and vapply returns the type you specify and fails if the result does not match it.

sapply
data('iris')
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(sapply(iris['Sepal.Length'], mean))
##  Named num 5.84
##  - attr(*, "names")= chr "Sepal.Length"
str(sapply(iris[c('Sepal.Length','Sepal.Width')], mean))
##  Named num [1:2] 5.84 3.06
##  - attr(*, "names")= chr [1:2] "Sepal.Length" "Sepal.Width"
lapply
data('iris')
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(lapply(iris['Sepal.Length'], mean))
## List of 1
##  $ Sepal.Length: num 5.84
str(lapply(iris[c('Sepal.Length','Sepal.Width')], mean))
## List of 2
##  $ Sepal.Length: num 5.84
##  $ Sepal.Width : num 3.06
vapply
data('iris')
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(vapply(iris[c('Sepal.Length', 'Sepal.Width')], mean, numeric(1)))
##  Named num [1:2] 5.84 3.06
##  - attr(*, "names")= chr [1:2] "Sepal.Length" "Sepal.Width"

mapply

mapply allows multiple inputs, which are passed together into one function.

data("women")
head(women)
##   height weight
## 1     58    115
## 2     59    117
## 3     60    120
## 4     61    123
## 5     62    126
## 6     63    129
# Body-mass index (kg / m^2) for every row of the `women` data set.
# The columns are extracted as vectors with [[ ]] so that mapply() calls the
# function once per (weight, height) pair and returns a plain numeric vector.
# Passing the one-column data frames women['weight'] / women['height'] makes
# mapply() call the function a single time on whole columns and return a
# one-column matrix instead.
bodymas_index <- mapply(function(weight_lb, height_in) {
  weight_kg <- weight_lb * 0.453592   # pounds -> kilograms
  height_m <- height_in * 2.54 / 100  # inches -> metres
  weight_kg / height_m^2
}, women[['weight']], women[['height']])
head(bodymas_index)
## [1] 24.03476 23.63087 23.43563 23.24039 23.04545 22.85107

Assignment 6: Summary

The function gives a summary of the data, including max, min, standard deviation, mean, sum and size.

#' Print summary statistics of the numeric values in `x`
#'
#' Non-numeric elements and missing values (`NA`) are ignored.
#'
#' @param x A vector or list; only its numeric, non-missing elements are used.
#' @return Called for its printed output: maximum, minimum, sum, mean, size
#'   (number of values used) and sample standard deviation, one per line.
#'   The last printed string is returned invisibly.
mysummary <- function(x) {
  # Keep numeric elements only (as.list() lets vectors and lists share one
  # code path), then drop missing values so they cannot break the statistics.
  values <- as.numeric(unlist(Filter(is.numeric, as.list(x))))
  values <- values[!is.na(values)]

  size <- length(values)
  total <- sum(values)
  average <- total / size  # NaN when there are no values
  # min()/max() warn and return +/-Inf on empty input, so report NA instead.
  smallest <- if (size > 0) min(values) else NA
  largest <- if (size > 0) max(values) else NA
  # Sample standard deviation, sqrt(sum((v - mean)^2) / (n - 1)); sd() gives
  # NA when fewer than two values are available.
  std <- stats::sd(values)

  print(paste("Max: ", largest))
  print(paste("Min: ", smallest))
  print(paste("Sum: ", total))
  print(paste("Mean: ", average))
  print(paste("Size: ", size))
  print(paste("Std deviation: ", std))
}
mysummary(c(1,2,3))
## [1] "Max:  3"
## [1] "Min:  1"
## [1] "Sum:  6"
## [1] "Mean:  2"
## [1] "Size:  3"
## [1] "Std deviation:  1"