library('readxl')
# Load the workbook into a tibble, then preview its first rows.
excel_data <- read_excel(path = "students.xlsx")
head(excel_data)
## # A tibble: 4 × 4
## Names Algorithm Math `R Programming`
## <chr> <dbl> <dbl> <dbl>
## 1 Yves 16 17 18
## 2 Anitha 14 15 16
## 3 Mugisha 15 16 17
## 4 Uwase 17 16 15
To import data from Excel, we use the readxl package: install it first if it is not already installed, load it with library(), and then call read_excel() with the path to the file.
To import data from statistical software (SAS, SPSS and Stata), we use the haven package.
library('haven')
# Read the SAS dataset 'accidents.sas7bdat' with haven and preview it.
# NOTE(review): the preview below shows values around 5e-315 (subnormal
# doubles), while the .sav and .dta files with the same columns show counts
# such as 57997 -- the SAS file looks mis-decoded or corrupt; TODO confirm
# against the original export before using these numbers.
data <- read_sas('accidents.sas7bdat')
head(data)
## # A tibble: 6 × 4
## agecat gender accid pop
## <dbl> <dbl> <dbl> <dbl>
## 1 5.30e-315 5.30e-315 5.38e-315 5.39e-315
## 2 5.30e-315 5.30e-315 5.38e-315 5.39e-315
## 3 5.31e-315 5.30e-315 5.38e-315 5.39e-315
## 4 5.30e-315 0 5.38e-315 5.39e-315
## 5 5.30e-315 0 5.38e-315 5.39e-315
## 6 5.31e-315 0 5.38e-315 5.39e-315
library('haven')
# Import the SPSS file; the preview shows agecat and gender as labelled
# columns (<dbl+lbl>), i.e. numeric codes with their value labels attached.
data <- read_sav(file = "accidents.sav")
head(data)
## # A tibble: 6 × 4
## agecat gender accid pop
## <dbl+lbl> <dbl+lbl> <dbl> <dbl>
## 1 1 [Under 21] 1 [Female] 57997 198522
## 2 2 [21-25] 1 [Female] 57113 203200
## 3 3 [26-30] 1 [Female] 54123 200744
## 4 1 [Under 21] 0 [Male] 63936 187791
## 5 2 [21-25] 0 [Male] 64835 195714
## 6 3 [26-30] 0 [Male] 66804 208239
library('haven')
# Import the Stata file and show its first six rows.
data <- read_dta(file = "accidents.dta")
head(data)
## # A tibble: 6 × 4
## agecat gender accid pop
## <dbl+lbl> <dbl+lbl> <dbl> <dbl>
## 1 1 [Under 21] 1 [Female] 57997 198522
## 2 2 [21-25] 1 [Female] 57113 203200
## 3 3 [26-30] 1 [Female] 54123 200744
## 4 1 [Under 21] 0 [Male] 63936 187791
## 5 2 [21-25] 0 [Male] 64835 195714
## 6 3 [26-30] 0 [Male] 66804 208239
# Parse the comma-separated text file, taking column names from its first
# line. NOTE(review): this is an absolute Windows path, unlike the relative
# paths used by the other imports -- confirm it is intentional.
data <- read.table(
  file = "D:/R/students.txt",
  header = TRUE,
  sep = ","
)
head(data)
## Names Algorithm Math R.Programming
## 1 Yves 16 17 18
## 2 Anitha 14 15 16
## 3 Mugisha 15 16 17
## 4 Uwase 17 16 15
Here we import data from a .txt text file in which rows are separated by newlines and columns by commas.
library(xml2)
# Parse the XML document and print an outline of its node hierarchy.
doc <- read_xml(x = "students.xml")
xml_structure(x = doc)
## <students>
## <student>
## <name>
## {text}
## <algorithm>
## {text}
## <math>
## {text}
## <r_programming>
## {text}
## <student>
## <name>
## {text}
## <algorithm>
## {text}
## <math>
## {text}
## <r_programming>
## {text}
## <student>
## <name>
## {text}
## <algorithm>
## {text}
## <math>
## {text}
## <r_programming>
## {text}
## <student>
## <name>
## {text}
## <algorithm>
## {text}
## <math>
## {text}
## <r_programming>
## {text}
# Pull one column per child node of <student> out of the parsed document.
# xml_text() always returns character, so the score columns are converted to
# numeric here; left as text, mean(data$math) would return NA with a warning.
col_names <- xml_text(xml_find_all(doc, ".//name"))
col_math <- as.numeric(xml_text(xml_find_all(doc, ".//math")))
col_r <- as.numeric(xml_text(xml_find_all(doc, ".//r_programming")))
col_algo <- as.numeric(xml_text(xml_find_all(doc, ".//algorithm")))

# Column name fixed from the misspelled `r_progamming` so that it matches the
# <r_programming> node it is read from.
data <- data.frame(
  name = col_names,
  math = col_math,
  r_programming = col_r,
  algorithm = col_algo
)
head(data)
## name math r_programming algorithm
## 1 Yves 17 18 16
## 2 Anitha 15 16 14
## 3 Mugisha 16 17 15
## 4 Uwase 16 15 17
To import data from XML, we use the xml2 package and select the values by node name with XPath expressions.
library(rvest)
# Download the Igihe homepage, select the headline containers, and collect
# the text and href of every link found inside them.
page <- read_html("https://igihe.com/index.php")
news_data <- page %>% html_elements(".homenews-title")
news_titles <- news_data %>% html_elements("a") %>% html_text()
news_links <- news_data %>% html_elements("a") %>% html_attr("href")
data <- data.frame(title = news_titles, link = news_links)
head(data)
## title
## 1 Miliyoni 300 Frw zigiye kwifashishwa mu kubungabunga ibirwa bibiri by’i Rutsiro
## 2 Ikitakwishe kiragukomeza- Shakira ku itandukana rye na Gerard Piqué
## 3 Ubushakashatsi bwagaragaje inshuro umuntu agomba koga mu mutwe
## 4 Bigogwe: Abakozi ba Access to Finance Rwanda bibutse abishwe muri Jenoside, igabira inka abayirokotse
## 5 Kuki abantu bambara imyenda y’umukara mu gushyingura?
## 6 Netanyahu ahangayikishijwe n’abashaka kwica Perezida Trump
## link
## 1 amakuru/u-rwanda/article/miliyoni-300frw-zigiye-kwifashishwa-mu-kubungabunga-ibirwa-bibiri-by-i-rutsiro
## 2 imikino/article/ikitakwishe-kiragukomeza-shakira-ku-itandukana-rye-na-gerard-pique
## 3 ubuzima/article/ubushakashatsi-bwagaragaje-inshuro-umuntu-agomba-koga-mu-mutwe
## 4 amakuru/u-rwanda/article/bigogwe-access-to-finance-rwanda-yibutse-abazize-jenoside-igabira-inka
## 5 amakuru/utuntu-n-utundi/article/kuki-abantu-bambara-imyenda-y-umukara-mu-gushyingura
## 6 amakuru/mu-mahanga/article/netanyahu-ahangayikishijwe-n-abashaka-kwica-perezida-trump
We use rvest for web scraping: it downloads the page and parses the returned HTML. Here I am extracting news titles and links from the Igihe homepage.
library(DBI)
library(RMariaDB)
# Open a connection to the local MariaDB server (database `r`).
# NOTE(review): connects as `root` with no password argument -- acceptable
# for a local demo, but confirm it is never pointed at a shared server.
con <- dbConnect(
  RMariaDB::MariaDB(),
  host = "localhost",
  port = 3306,
  user = "root",
  dbname = "r"
)

# Read the whole `students` table by name.
data <- dbReadTable(con, "students")
head(data)
## names math algorithm r_programming id
## 1 Yves 14 15 16 1
## 2 Anitha 16 17 18 2

# Fetch the same rows through an explicit SQL query.
data2 <- dbGetQuery(con, "SELECT * FROM students")
head(data2)
## names math algorithm r_programming id
## 1 Yves 14 15 16 1
## 2 Anitha 16 17 18 2

# Close the connection once both reads are done; `con` is reassigned further
# down, which would otherwise leave this connection open.
dbDisconnect(con)
library(DBI)
library(odbc)
# Open the Access database file through the ODBC driver.
con <- dbConnect(
  odbc::odbc(),
  Driver = "Microsoft Access Driver (*.mdb, *.accdb)",
  DBQ = "./students.accdb"
)

# Read the whole `students` table by name.
data <- dbReadTable(con, "students")
head(data)
## ID names Algorithm Math R.Programming
## 1 1 Yves 15 16 17
## 2 2 Anitha 16 17 18

# Fetch the same rows through an explicit SQL query. In the output the last
# column keeps its original name "R Programming", whereas dbReadTable() above
# shows it as R.Programming.
data2 <- dbGetQuery(con, "SELECT * FROM students")
head(data2)
## ID names Algorithm Math R Programming
## 1 1 Yves 15 16 17
## 2 2 Anitha 16 17 18

# Close the connection so the database file handle is released.
dbDisconnect(con)
# Load both CSV files, then list the columns of the population table.
world_population <- read.csv(file = "world_population.csv")
co2_emission <- read.csv(file = "CO2_emission.csv")
colnames(world_population)
## [1] "Rank" "CCA3"
## [3] "Country.Territory" "Capital"
## [5] "Continent" "X2022.Population"
## [7] "X2020.Population" "X2015.Population"
## [9] "X2010.Population" "X2000.Population"
## [11] "X1990.Population" "X1980.Population"
## [13] "X1970.Population" "Area..km.."
## [15] "Density..per.km.." "Growth.Rate"
## [17] "World.Population.Percentage"
# List the columns of the CO2 table. NOTE(review): the trailing "X2019.1" in
# the output suggests the CSV has two columns headed 2019 -- confirm which
# one is the real 2019 series.
colnames(co2_emission)
## [1] "Country.Name" "country_code" "Region" "Indicator.Name"
## [5] "X1990" "X1991" "X1992" "X1993"
## [9] "X1994" "X1995" "X1996" "X1997"
## [13] "X1998" "X1999" "X2000" "X2001"
## [17] "X2002" "X2003" "X2004" "X2005"
## [21] "X2006" "X2007" "X2008" "X2009"
## [25] "X2010" "X2011" "X2012" "X2013"
## [29] "X2014" "X2015" "X2016" "X2017"
## [33] "X2018" "X2019" "X2019.1"
# Full outer join of the 2010 population and 2010 CO2 figures on country
# name; with all = TRUE, countries present in only one table are kept and
# their missing values are filled with NA.
merged <- merge(
  x = world_population[c("Country.Territory", "X2010.Population")],
  y = co2_emission[c("Country.Name", "X2010")],
  by.x = "Country.Territory",
  by.y = "Country.Name",
  all = TRUE
)
head(merged)
## Country.Territory X2010.Population X2010
## 1 Afghanistan 28189672 0.2436140
## 2 Albania 2913399 1.5276237
## 3 Algeria 35856344 3.1736545
## 4 American Samoa 54849 NA
## 5 Andorra 71519 6.1571978
## 6 Angola 23364185 0.9761842
First I imported two datasets (World Population and CO2 Emission). Then I checked their columns and decided to join the 2010 data from both datasets on the country name.
# Load the built-in iris dataset into the workspace and preview its first rows.
data("iris")
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per-species means of three of the iris measurements.
iris %>%
  group_by(Species) %>%
  summarise(
    average_sepal_length = mean(Sepal.Length),
    average_sepal_width = mean(Sepal.Width),
    average_petal_length = mean(Petal.Length)
  )
## # A tibble: 3 × 4
## Species average_sepal_length average_sepal_width average_petal_length
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 1.46
## 2 versicolor 5.94 2.77 4.26
## 3 virginica 6.59 2.97 5.55
Here we take the whole dataset, group it by species, and compute summary statistics such as the average of each measurement, so we can see how, for example, petal length differs by species.
trace() is used to know when a function is called, and we use untrace() when we no longer want to be notified that the function is called.
# Install a tracer on mean(): the quoted expression is evaluated on entry to
# every call, as the "on entry" line in the output below shows.
trace(mean, quote(print('We have traced the mean function')))
## Tracing function "mean" in package "base"
## [1] "mean"
# This call runs the tracer first, then prints the usual result.
mean(c(1,2,3))
## Tracing mean(c(1, 2, 3)) on entry
## [1] "We have traced the mean function"
## [1] 2
# Remove the tracer; the next call prints only its result.
untrace(mean)
## Untracing function "mean" in package "base"
mean(c(1,2,3))
## [1] 2
This is used when an error occurs, so that you can walk through the code step by step.
sapply, lapply and vapply all take a list (or vector) as input, but their outputs differ: lapply always returns a list; sapply simplifies its result depending on what the function returns; vapply returns the type you specify and fails if the result does not match.
# Load the built-in iris dataset and preview it.
data('iris')
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# iris['Sepal.Length'] is a one-column data frame, so sapply() applies mean()
# to that column; the output shows the result simplified to a named numeric
# vector.
str(sapply(iris['Sepal.Length'], mean))
## Named num 5.84
## - attr(*, "names")= chr "Sepal.Length"
# With two columns selected, the result is a named numeric vector of length 2.
str(sapply(iris[c('Sepal.Length','Sepal.Width')], mean))
## Named num [1:2] 5.84 3.06
## - attr(*, "names")= chr [1:2] "Sepal.Length" "Sepal.Width"
# Load the built-in iris dataset and preview it.
data('iris')
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# lapply() applies mean() to each selected column and, as the output shows,
# returns a list with one named element per column.
str(lapply(iris['Sepal.Length'], mean))
## List of 1
## $ Sepal.Length: num 5.84
# Two columns in, a list of two means out.
str(lapply(iris[c('Sepal.Length','Sepal.Width')], mean))
## List of 2
## $ Sepal.Length: num 5.84
## $ Sepal.Width : num 3.06
# Load the built-in iris dataset and preview it.
data('iris')
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# vapply() applies mean() to each selected column; numeric(1) declares that
# every call must return a single numeric value. The output is a named
# numeric vector.
str(vapply(iris[c('Sepal.Length', 'Sepal.Width')], mean, numeric(1)))
## Named num [1:2] 5.84 3.06
## - attr(*, "names")= chr [1:2] "Sepal.Length" "Sepal.Width"
mapply allows multiple inputs to be passed, element by element, to one function.
# Load the built-in women dataset (height and weight columns) and preview it.
data("women")
head(women)
## height weight
## 1 58 115
## 2 59 117
## 3 60 120
## 4 61 123
## 5 62 126
## 6 63 129

# Body-mass index per row: weight is scaled by 0.453592 (pounds to kilograms)
# and height by 2.54 / 100 (inches to metres), then BMI = kg / m^2.
# The columns are passed as plain vectors (women$weight, women$height) so
# that mapply() pairs them element by element and returns a numeric vector.
# Passing one-column data frames (women['weight']) made mapply() iterate over
# columns instead and return a 15 x 1 matrix.
bodymas_index <- mapply(function(weight_lb, height_in) {
  weight_kg <- weight_lb * 0.453592
  height_m <- height_in * 2.54 / 100
  weight_kg / height_m^2
}, women$weight, women$height)
head(bodymas_index)
## [1] 24.03476 23.63087 23.43563 23.24039 23.04545 22.85107
This function prints a summary of the data, including the max, min, standard deviation, mean, sum and size.
# Print a summary (max, min, sum, mean, size, standard deviation) of the
# numeric, non-missing elements of `x`.
#
# x: an atomic vector or a list. Elements that are not single numeric values,
#    or that are NA, are ignored.
# Returns, invisibly, the last printed line (the value of the final print()).
#
# The standard deviation is the sample standard deviation (n - 1 denominator,
# as computed by stats::sd); it is NA when fewer than two values remain.
# Earlier this line reported the raw sum of squared deviations.
mysummary <- function(x) {
  # Flag the elements that can take part in the arithmetic.
  usable <- vapply(
    x,
    function(el) is.numeric(el) && length(el) == 1L && !is.na(el),
    logical(1)
  )
  values <- as.numeric(unlist(x[usable]))

  size <- length(values)
  total <- sum(values)
  average <- total / size # NaN when there is nothing to summarise

  # max()/min() warn and return -Inf/Inf on empty input, so report NA instead.
  largest <- if (size > 0) max(values) else NA
  smallest <- if (size > 0) min(values) else NA
  std <- stats::sd(values)

  print(paste("Max: ", largest))
  print(paste("Min: ", smallest))
  print(paste("Sum: ", total))
  print(paste("Mean: ", average))
  print(paste("Size: ", size))
  print(paste("Std deviation: ", std))
}
mysummary(c(1,2,3))
## [1] "Max:  3"
## [1] "Min:  1"
## [1] "Sum:  6"
## [1] "Mean:  2"
## [1] "Size:  3"
## [1] "Std deviation:  1"