The working directory of my RStudio is "/Users/meiyaoli".

### 1.2
My homework .rmd file is saved in the folder Problem_Sets, whose absolute path is "/Users/meiyaoli/My Drive/[01]-School-work/AU23-INFO201/Problem_Sets/".

### 1.3
It matters because the two folders have different relative paths to the same files, so a relative path that works from one folder does not work from the other.
# Report the directory this document is evaluated in.
getwd()
## [1] "/Users/meiyaoli/My Drive/[01]-School-work/AU23-INFO201/Problem_Sets"
# One vector per column: the student names plus their scores in three courses.
names <- c("Amelia", "Emma", "Alex", "Meiyao", "Owin")
math <- c(80, 65, 70, 89, 46)
japanese <- c(100, 79, 98, 99, 87)
dance <- c(68, 33, 45, 78, 36)
# Spell the column names out; this yields the same columns that data.frame()
# would derive from the variable names.
grades <- data.frame(
  names = names,
  math = math,
  japanese = japanese,
  dance = dance
)
grades
## names math japanese dance
## 1 Amelia 80 100 68
## 2 Emma 65 79 33
## 3 Alex 70 98 45
## 4 Meiyao 89 99 78
## 5 Owin 46 87 36
# Each row is one student.
cat("There are", nrow(grades), "students in my data.")
## There are 5 students in my data.
# Every column except `names` is a course.
cat("There are", ncol(grades) - 1, "courses in my data.")
## There are 3 courses in my data.
Print the last two lines of my data.
# tail() keeps the final two rows (and every column) of the data frame.
print(tail(grades, 2))
## names math japanese dance
## 4 Meiyao 89 99 78
## 5 Owin 46 87 36
Create a csv file in the data/ directory
library(readr)
# Save the grade table as a CSV file in the sibling data/ directory.
write_csv(x = grades, file = "../data/grades.csv")
Note: the `path` argument of `write_csv()` is deprecated; use `file` instead.
# List the data directory to confirm that grades.csv is there.
list.files(path = "../data")
## [1] "babynames.csv.bz2" "grades.csv"
## [3] "life-expectancy.csv.bz2"
# Indirect access: the column name is held in a variable.
col <- "dance"
col
## [1] "dance"
# `$` takes the name after it literally, so grades$col would look for a column
# called "col"; use [[ ]] when the name is stored in a variable.
print(grades[[col]])
## [1] 68 33 45 78 36
# Approach 1: matrix-style indexing with an empty row selector.
col <- "math"
i <- grades[, col]
max(i)
## [1] 89
# Approach 2: list-style extraction with [[ ]]. `j` holds the column itself,
# so max() is applied exactly once (the original took max() of a max()).
j <- grades[[col]]
max(j)
## [1] 89
# Test whether the selected column is numeric (`col` is "math" at this point).
is.numeric(grades[[col]])
## [1] TRUE
# It is numeric, so its mean can be computed.
mean(grades[[col]])
## [1] 70
# Test whether the names column in grades is numeric.
is.numeric(grades$names)
## [1] FALSE
# For every column of `grades`, print its name; for numeric columns also print
# the average value.
for (i in names(grades)) {
  # The name is printed for every column, so it is written once, up front.
  cat(i, "\n")
  # is.numeric() already returns a single TRUE/FALSE; comparing it with TRUE
  # adds nothing.
  if (is.numeric(grades[[i]])) {
    cat("Average value:", mean(grades[[i]]), "\n")
  }
}
## names
## math
## Average value: 70
## japanese
## Average value: 92.6
## dance
## Average value: 52
Vectorized operations
# Compute the GPA from the columns of `grades` itself rather than from the
# free-standing math/japanese/dance vectors, so the values always line up with
# the rows of the data frame even if it has been filtered or reordered.
# The arithmetic is vectorized: one expression handles every row.
grades$gpa <- (grades$math + grades$japanese + grades$dance) / 3
grades
## names math japanese dance gpa
## 1 Amelia 80 100 68 82.66667
## 2 Emma 65 79 33 59.00000
## 3 Alex 70 98 45 71.00000
## 4 Meiyao 89 99 78 88.66667
## 5 Owin 46 87 36 56.33333
# The logical row selector picks the row(s) whose gpa equals the maximum; the
# empty column selector keeps every column of that row.
best <- grades[grades[["gpa"]] == max(grades[["gpa"]]), ]
print(best)
## names math japanese dance gpa
## 4 Meiyao 89 99 78 88.66667
# Method 1: same row selector, but keep only the "names" column.
print(grades[grades[["gpa"]] == max(grades[["gpa"]]), "names"])
## [1] "Meiyao"
# Method 2: extract the names vector first, then subset it with the selector.
print(grades[["names"]][grades[["gpa"]] == max(grades[["gpa"]])])
## [1] "Meiyao"
# Show the whole table so the answer can be checked against it.
grades
## names math japanese dance gpa
## 1 Amelia 80 100 68 82.66667
## 2 Emma 65 79 33 59.00000
## 3 Alex 70 98 45 71.00000
## 4 Meiyao 89 99 78 88.66667
## 5 Owin 46 87 36 56.33333
Yep, I got the name right
`mj` is a logical variable that indicates whether the student is better at math than at Japanese.
# TRUE where the math score is strictly higher than the Japanese score.
grades$mj <- grades$math > grades$japanese
# Show the first three rows. R indexes from 1; the original 0:3 only worked
# because a 0 index is silently dropped.
head(grades, 3)
## names math japanese dance gpa mj
## 1 Amelia 80 100 68 82.66667 FALSE
## 2 Emma 65 79 33 59.00000 FALSE
## 3 Alex 70 98 45 71.00000 FALSE
# Summing a logical vector counts its TRUE values directly.
cat(sum(grades$mj), "students are better at math than Japanese.")
## 0 students are better at math than Japanese.
# Read the life-expectancy data set. No delimiter is passed, so read_delim()
# has to work it out from the file; the message below reports a tab.
life <- read_delim("../data/life-expectancy.csv.bz2")
## Rows: 213 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): name, region
## dbl (6): le1960, le2019, GDP_PC1960, GDP_PC2019, population1960, population2019
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the data to check that it was read sensibly.
life
## # A tibble: 213 × 8
## name region le1960 le2019 GDP_PC1960 GDP_PC2019 population1960
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba Ameri… 65.7 76.3 NA NA 54211
## 2 Afghanistan Asia 32.4 64.8 NA 573. 8996973
## 3 Angola Africa 37.5 61.1 NA 3111. 5454933
## 4 Albania Europe 62.3 78.6 NA 5211. 1608800
## 5 Andorra Europe NA NA NA 45887. 13411
## 6 United Arab Emirat… Asia 51.5 78.0 NA 41420. 92418
## 7 Argentina Ameri… 65.1 76.7 5643. 9742. 20481779
## 8 Armenia Asia 66.0 75.1 NA 4732. 1874121
## 9 American Samoa Ocean… NA NA NA NA 20123
## 10 Antigua and Barbuda Ameri… 62.0 77.0 NA 15704. 54131
## # ℹ 203 more rows
## # ℹ 1 more variable: population2019 <dbl>
# Number of observations.
cat(NROW(life), "rows")
## 213 rows
# Number of variables.
cat(NCOL(life), "columns")
## 8 columns
# Variable names.
print(colnames(life))
## [1] "name" "region" "le1960" "le2019"
## [5] "GDP_PC1960" "GDP_PC2019" "population1960" "population2019"
The few lines of life-expectancy data printed below look reasonable: although the rows are a random sample (sample size = 5), they show the expected mix of text, numeric values, and missing values (NA).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Draw five random rows as a spot check of the data. slice_sample() is the
# current dplyr verb for this; sample_n() is superseded.
life %>%
  slice_sample(n = 5)
## # A tibble: 5 × 8
## name region le1960 le2019 GDP_PC1960 GDP_PC2019 population1960 population2019
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mala… Asia 60.0 76.2 1354. 12487. 8156347 31949777
## 2 Niger Africa 35.1 62.4 826. 563. 3388764 23310715
## 3 Hond… Ameri… 46.3 75.3 1096. 2241. 2038632 9746117
## 4 Iran… Asia 44.9 76.7 2979. 5923. 21906914 82913906
## 5 Pola… Europe 67.7 77.9 NA 17409. 29637450 37965475
Each row in this dataset represents one country: its life expectancy at birth, GDP per capita, and population in 1960 and in 2019.
# Empty vector intended to store the result (not used in the code shown here).
na_counts <- numeric()
# Test is.na() on a single cell. `life` is a tibble and tibbles have no row
# names, so the row has to be selected by matching the `name` column. The
# original life["Dominica", "le1960"] matched no row and therefore produced an
# all-NA row, which made the test TRUE regardless of the data.
is.na(life[life$name == "Dominica", "le1960"])
## NOTE(review): the two output lines below were recorded with the original
## row-name lookup; re-knit to confirm the value for Dominica.
## le1960
## [1,] TRUE
# Count the missing values in one column:
# (1) is.na() returns a logical vector, TRUE where the value is missing
# (2) sum() counts the number of TRUE values
sum(is.na(life[["le1960"]]))
## [1] 25
# Walk over every column of `life` and report how many of its values are
# missing.
for (col in names(life)) {
  cat(col, "\n")
  cat(sum(is.na(life[[col]])), "NA values\n")
}
## name
## 0 NA values
## region
## 0 NA values
## le1960
## 25 NA values
## le2019
## 17 NA values
## GDP_PC1960
## 122 NA values
## GDP_PC2019
## 29 NA values
## population1960
## 1 NA values
## population2019
## 1 NA values
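As we can see, the name and region columns have no missing values and the two population columns are each missing only one value, whereas le1960 (25), le2019 (17), GDP_PC2019 (29) and especially GDP_PC1960 (122 of 213 rows) have many missing values.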
Add new variable: the growth in life expectancy from 1960 to 2019
# Preview the new column on five random rows. The result is not assigned, so
# `life` itself is left unchanged by this pipeline. slice_sample() replaces
# the superseded sample_n().
life %>%
  mutate(growth = le2019 - le1960) %>%
  slice_sample(n = 5)
## # A tibble: 5 × 9
## name region le1960 le2019 GDP_PC1960 GDP_PC2019 population1960 population2019
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aust… Ocean… 70.8 82.9 19378. 57183. 10276477 25365745
## 2 Palau Ocean… NA NA NA 12078. 9771 18008
## 3 Azer… Asia 61.0 73.0 NA 5879. 3895397 10024283
## 4 Hong… Asia 67.0 85.1 NA 37928. 3075605 7507400
## 5 Aust… Europe 68.6 81.8 13031. 50537. 7047539 8879920
## # ℹ 1 more variable: growth <dbl>
# Alternate method: base-R assignment, which stores the new column in `life`.
life[["growth"]] <- life[["le2019"]] - life[["le1960"]]
head(life, n = 6)
## # A tibble: 6 × 9
## name region le1960 le2019 GDP_PC1960 GDP_PC2019 population1960 population2019
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba Ameri… 65.7 76.3 NA NA 54211 106314
## 2 Afgh… Asia 32.4 64.8 NA 573. 8996973 38041754
## 3 Ango… Africa 37.5 61.1 NA 3111. 5454933 31825295
## 4 Alba… Europe 62.3 78.6 NA 5211. 1608800 2854191
## 5 Ando… Europe NA NA NA 45887. 13411 77142
## 6 Unit… Asia 51.5 78.0 NA 41420. 92418 9770529
## # ℹ 1 more variable: growth <dbl>
Average improvement in LE over these years
# A vector of LE growth for all rows:
# life$growth
# The call below returns NA because the NA values are not removed first:
# mean(life$growth)
# Correct way: drop the rows with a missing growth before averaging.
life %>%
  filter(!is.na(growth)) %>%
  select(growth) %>% # still a one-column data frame, not a vector
  unlist() %>%
  mean() %>%
  cat("is the average improvement in LE over these years.")
## 18.79684 is the average improvement in LE over these years.
# summarize() vs. mean(): summarize() refers to the column directly, so no
# select()/unlist() step is needed, and the result stays a data frame.
life %>%
  filter(!is.na(growth)) %>%
  summarize(mean = mean(growth))
## # A tibble: 1 × 1
## mean
## <dbl>
## 1 18.8
Important notes: (1) `unlist()` the one-column data frame so that `cat()` can handle the piped value; (2) the piped value becomes the first argument of `cat()`, so write `cat("additional message")` with no comma in front of the message.
# Largest gain: sort by growth in descending order and keep the name in the
# first row.
life %>%
  arrange(desc(growth)) %>%
  slice(1) %>%
  select(name) %>%
  unlist() %>%
  cat("gained the most in terms of LE\n")
## Maldives gained the most in terms of LE
# Smallest gain: the same steps with an ascending sort.
life %>%
  arrange(growth) %>%
  slice(1) %>%
  select(name) %>%
  unlist() %>%
  cat("gained the least in terms of LE\n")
## Ukraine gained the least in terms of LE
# Count the countries whose LE grew by less than 5 years. A comparison with a
# missing growth is NA and is left out of the count, just as filter() drops
# such rows.
cat(
  sum(life$growth < 5, na.rm = TRUE),
  "countr(ies) have their LE improved less than 5 years"
)
## 1 countr(ies) have their LE improved less than 5 years
# Count the countries whose LE went down over the period.
cat(
  sum(life$growth < 0, na.rm = TRUE),
  "countr(ies) have their LE decreased over this time period"
)
## 0 countr(ies) have their LE decreased over this time period