PS4-Data-Frames

1. Working directory

1.1

The working directory of my RStudio session is "/Users/meiyaoli".

1.2

My homework .rmd file is saved in the folder Problem_Sets, whose absolute path is "/Users/meiyaoli/My Drive/[01]-School-work/AU23-INFO201/Problem_Sets/".

1.3

It matters because relative paths are resolved from the working directory, so the same relative path points to different files depending on which of the two folders is the working directory.

# Show the directory that relative paths (such as "../data") are resolved against.
getwd()

## [1] "/Users/meiyaoli/My Drive/[01]-School-work/AU23-INFO201/Problem_Sets"

2. Create your own data

2.1

# Student names; note that this variable shares its name with base R's names() function.
names <- c("Amelia", "Emma", "Alex", "Meiyao", "Owin")

2.2

# Math scores, one per student, in the same order as the names vector.
math <- c(80, 65, 70, 89, 46)

2.3

# Japanese scores, one per student, in the same order as the names vector.
japanese <- c(100, 79, 98, 99, 87)

2.4

# Dance scores, one per student, in the same order as the names vector.
dance <- c(68, 33, 45, 78, 36)

2.5

# Combine the name and score vectors into one table, one row per student.
grades <- data.frame(
  names = names,
  math = math,
  japanese = japanese,
  dance = dance
)
print(grades)

##    names math japanese dance
## 1 Amelia   80      100    68
## 2   Emma   65       79    33
## 3   Alex   70       98    45
## 4 Meiyao   89       99    78
## 5   Owin   46       87    36

2.6

# Each row of grades is one student.
n_students <- nrow(grades)
cat("There are", n_students, "students in my data.")

## There are 5 students in my data.

2.7

# Every column except `names` holds the scores of one course.
n_courses <- ncol(grades) - 1
cat("There are", n_courses, "courses in my data.")

## There are 3 courses in my data.

2.8

Print the last two lines of my data.

# tail() returns the last n rows and keeps their original row numbers,
# so no manual index arithmetic on nrow() is needed.
print(tail(grades, 2))

##    names math japanese dance
## 4 Meiyao   89       99    78
## 5   Owin   46       87    36

2.9

Create a csv file in the data/ directory

library(readr)
# Save the grades table as a CSV file in the sibling data/ directory.
grades_path <- "../data/grades.csv"
write_csv(grades, file = grades_path)

The path argument is deprecated, use file instead.

2.10

# List the contents of the data/ directory to confirm grades.csv was written.
list.files("../data")

## [1] "babynames.csv.bz2"       "grades.csv"             
## [3] "life-expectancy.csv.bz2"

3. Indirect variable names

3.1

# Store a column name in a variable so the column can be selected indirectly.
col <- "dance"
col

## [1] "dance"

3.2

## The dollar sign doesn't work with an indirect name: print(grades$col) would
## look for a column literally named "col", so use [[ ]] with the variable instead.
print(grades[[col]])

## [1] 68 33 45 78 36

3.3

# approach 1: pull the column out as a vector first, then take its maximum
col <- "math"
math_scores <- grades[, col]
max(math_scores)

## [1] 89

# approach 2: select the column by name with [[ ]] and take its maximum directly.
# j already holds the maximum, so it is printed as-is instead of calling max() again.
j <- max(grades[[col]])
j

## [1] 89

3.4

# test if the column named by col (currently "math", not dance) is numeric
is.numeric(grades[[col]])

## [1] TRUE

# average of that same column (col is still "math")
mean(grades[[col]])

## [1] 70

# test if the names column in grades is numeric (it holds text, so FALSE is expected)
is.numeric(grades$names)

## [1] FALSE

# the for loop: print every column name, and for numeric columns also print
# the column average. The name is printed once up front because both branches
# of the original printed it; is.numeric() already returns a single TRUE/FALSE,
# so it is used as the condition directly.
for (col_name in names(grades)) {
  cat(col_name, "\n")
  if (is.numeric(grades[[col_name]])) {
    cat("Average value:", mean(grades[[col_name]]), "\n")
  }
}

## names 
## math 
## Average value: 70 
## japanese 
## Average value: 92.6 
## dance 
## Average value: 52

4. Data Manipulations

4.1

Vectorized operations

# Average the three course scores row by row. The columns are read from the
# data frame itself rather than from the free-standing vectors, so the result
# stays correct even if grades is later reordered or filtered.
grades$gpa <- (grades$math + grades$japanese + grades$dance) / 3
grades

##    names math japanese dance      gpa
## 1 Amelia   80      100    68 82.66667
## 2   Emma   65       79    33 59.00000
## 3   Alex   70       98    45 71.00000
## 4 Meiyao   89       99    78 88.66667
## 5   Owin   46       87    36 56.33333

4.2

# Find the highest gpa, then keep every column of the row(s) that reach it.
top_gpa <- max(grades$gpa)
best <- grades[grades$gpa == top_gpa, ]
print(best)

##    names math japanese dance      gpa
## 4 Meiyao   89       99    78 88.66667

## Explanation: locate the row with the highest gpa, then print all of its columns by leaving the column selector blank

4.3

# Method 1: build the row filter once, then select the names column
is_top <- grades$gpa == max(grades$gpa)
print(grades[is_top, "names"])

## [1] "Meiyao"

## Method 1 above: locate the row with max gpa, print the name column
# Method 2: subset the names column directly with the same logical test
print(with(grades, names[gpa == max(gpa)]))

## [1] "Meiyao"

4.4

# Print the full table to cross-check the name found above against the gpa column.
grades

##    names math japanese dance      gpa
## 1 Amelia   80      100    68 82.66667
## 2   Emma   65       79    33 59.00000
## 3   Alex   70       98    45 71.00000
## 4 Meiyao   89       99    78 88.66667
## 5   Owin   46       87    36 56.33333

Yep, I got the name right

4.5

mj is a logical variable that indicates whether the student is better at math than at Japanese.

# mj is TRUE when the math score is higher than the Japanese score.
grades$mj <- grades$math > grades$japanese
# R indexes rows from 1 (a 0 index is silently dropped), so take the first
# three rows with head() rather than with 0:3.
head(grades, 3)

##    names math japanese dance      gpa    mj
## 1 Amelia   80      100    68 82.66667 FALSE
## 2   Emma   65       79    33 59.00000 FALSE
## 3   Alex   70       98    45 71.00000 FALSE

4.6

# mj is already logical, so summing it counts the TRUE values directly.
cat(sum(grades$mj), "students are better at math than Japanese.")

## 0 students are better at math than Japanese.

5 Life expectancy

5.1 Load Data

5.1.1

# No delimiter is given, so read_delim() has to detect it from the file itself.
life <- read_delim("../data/life-expectancy.csv.bz2")

## Rows: 213 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): name, region
## dbl (6): le1960, le2019, GDP_PC1960, GDP_PC2019, population1960, population2019
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print the tibble to inspect its first rows and column types.
life

## # A tibble: 213 × 8
##    name                region le1960 le2019 GDP_PC1960 GDP_PC2019 population1960
##    <chr>               <chr>   <dbl>  <dbl>      <dbl>      <dbl>          <dbl>
##  1 Aruba               Ameri…   65.7   76.3        NA         NA           54211
##  2 Afghanistan         Asia     32.4   64.8        NA        573.        8996973
##  3 Angola              Africa   37.5   61.1        NA       3111.        5454933
##  4 Albania             Europe   62.3   78.6        NA       5211.        1608800
##  5 Andorra             Europe   NA     NA          NA      45887.          13411
##  6 United Arab Emirat… Asia     51.5   78.0        NA      41420.          92418
##  7 Argentina           Ameri…   65.1   76.7      5643.      9742.       20481779
##  8 Armenia             Asia     66.0   75.1        NA       4732.        1874121
##  9 American Samoa      Ocean…   NA     NA          NA         NA           20123
## 10 Antigua and Barbuda Ameri…   62.0   77.0        NA      15704.          54131
## # ℹ 203 more rows
## # ℹ 1 more variable: population2019 <dbl>

5.1.2

# dim() returns c(rows, columns); the first element is the row count.
cat(dim(life)[1], "rows")

## 213 rows

# dim() returns c(rows, columns); the second element is the column count.
cat(dim(life)[2], "columns")

## 8 columns

5.1.3

# Show the variable (column) names of the data set.
print(colnames(life))

## [1] "name"           "region"         "le1960"         "le2019"        
## [5] "GDP_PC1960"     "GDP_PC2019"     "population1960" "population2019"

5.1.4

The few lines of life-expectancy data printed below look good: although the rows are randomly sampled (sample size = 5), they show the expected mix of text, numeric values, and missing values (NA).

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Show five randomly chosen rows. slice_sample() is the current replacement
# for the superseded sample_n().
life %>%
  slice_sample(n = 5)

## # A tibble: 5 × 8
##   name  region le1960 le2019 GDP_PC1960 GDP_PC2019 population1960 population2019
##   <chr> <chr>   <dbl>  <dbl>      <dbl>      <dbl>          <dbl>          <dbl>
## 1 Mala… Asia     60.0   76.2      1354.     12487.        8156347       31949777
## 2 Niger Africa   35.1   62.4       826.       563.        3388764       23310715
## 3 Hond… Ameri…   46.3   75.3      1096.      2241.        2038632        9746117
## 4 Iran… Asia     44.9   76.7      2979.      5923.       21906914       82913906
## 5 Pola… Europe   67.7   77.9        NA      17409.       29637450       37965475

5.1.5

Each row in this dataset represents one country, with its life expectancy at birth, GDP per capita, and population in the years 1960 and 2019.

5.1.6

# test the is.na function on a single cell: Dominica's 1960 life expectancy.
# The country is matched through the name column because a tibble has no row
# names; indexing with life["Dominica", ] selects a row that does not exist
# and therefore reports NA no matter what the data contain.
is.na(life[life$name == "Dominica", "le1960"])

##      le1960
## [1,]   TRUE

# how to scan through each row of a column and return the number of NA rows
sum(is.na(life$le1960))

## [1] 25

# (1) is.na() returns a vec of true and false
# (2) sum() counts the number of TRUE values 


# for loop: for every column, print its name and how many of its values are NA
for (col in names(life)) {
  n_missing <- sum(is.na(life[[col]]))
  cat(col, " \n", n_missing, " NA values\n", sep = "")
}

## name 
## 0 NA values
## region 
## 0 NA values
## le1960 
## 25 NA values
## le2019 
## 17 NA values
## GDP_PC1960 
## 122 NA values
## GDP_PC2019 
## 29 NA values
## population1960 
## 1 NA values
## population2019 
## 1 NA values

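As we can see, the name and region columns have no missing values and the two population columns have only one each, whereas le1960, le2019, GDP_PC1960 and GDP_PC2019 are not so good: they have many missing values, with GDP_PC1960 the worst (122 of 213 rows).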

5.2 Analyze data

5.2.1

Add new variable: the growth in life expectancy from 1960 to 2019

# Compute the growth column on the fly and preview five random rows; the result
# is not assigned, so life itself is unchanged by this chunk.
# slice_sample() is the current replacement for the superseded sample_n().
life %>%
  mutate(growth = le2019 - le1960) %>%
  slice_sample(n = 5)

## # A tibble: 5 × 9
##   name  region le1960 le2019 GDP_PC1960 GDP_PC2019 population1960 population2019
##   <chr> <chr>   <dbl>  <dbl>      <dbl>      <dbl>          <dbl>          <dbl>
## 1 Aust… Ocean…   70.8   82.9     19378.     57183.       10276477       25365745
## 2 Palau Ocean…   NA     NA          NA      12078.           9771          18008
## 3 Azer… Asia     61.0   73.0        NA       5879.        3895397       10024283
## 4 Hong… Asia     67.0   85.1        NA      37928.        3075605        7507400
## 5 Aust… Europe   68.6   81.8     13031.     50537.        7047539        8879920
## # ℹ 1 more variable: growth <dbl>

# Alternate method: assign the new column directly, which stores it in life.
life[["growth"]] <- life[["le2019"]] - life[["le1960"]]
head(life, n = 6)

## # A tibble: 6 × 9
##   name  region le1960 le2019 GDP_PC1960 GDP_PC2019 population1960 population2019
##   <chr> <chr>   <dbl>  <dbl>      <dbl>      <dbl>          <dbl>          <dbl>
## 1 Aruba Ameri…   65.7   76.3         NA        NA           54211         106314
## 2 Afgh… Asia     32.4   64.8         NA       573.        8996973       38041754
## 3 Ango… Africa   37.5   61.1         NA      3111.        5454933       31825295
## 4 Alba… Europe   62.3   78.6         NA      5211.        1608800        2854191
## 5 Ando… Europe   NA     NA           NA     45887.          13411          77142
## 6 Unit… Asia     51.5   78.0         NA     41420.          92418        9770529
## # ℹ 1 more variable: growth <dbl>

5.2.2

Average improvement in LE over these years

# life$growth is a vector with the LE growth of every row. It contains NA for
# countries that lack either year, so a plain mean(life$growth) returns NA.
# Correct way: na.rm = TRUE drops the missing values before averaging.
cat(
  mean(life$growth, na.rm = TRUE),
  "is the average improvement in LE over these years."
)

## 18.79684 is the average improvement in LE over these years.

#### summarize vs. mean() function
# summarize() returns the result as a one-row tibble. na.rm = TRUE skips the
# missing growth values, so no separate filter() or select() step is needed.
life %>%
  summarize(mean = mean(growth, na.rm = TRUE))

## # A tibble: 1 × 1
##    mean
##   <dbl>
## 1  18.8

5.2.3

Important notes:

- unlist the data frame so that cat() can handle the pipe output
- the piped value becomes the first argument of cat("additional message"), so there is no need to add a comma in front of the message

# Sort by growth from largest to smallest, keep the top row, and hand its
# country name to cat() as a plain character value.
life %>%
  arrange(desc(growth)) %>%
  slice_head(n = 1) %>%
  pull(name) %>%
  cat("gained the most in terms of LE\n")

## Maldives gained the most in terms of LE

# Sort by growth from smallest to largest, keep the top row, and hand its
# country name to cat() as a plain character value.
life %>%
  arrange(growth) %>%
  slice_head(n = 1) %>%
  pull(name) %>%
  cat("gained the least in terms of LE\n")

## Ukraine gained the least in terms of LE

5.2.4

# Count the countries whose LE grew by less than 5 years; rows with a missing
# growth value are not counted.
n_under_5 <- sum(life$growth < 5, na.rm = TRUE)
cat(n_under_5, "countr(ies) have their LE improved less than 5 years")

## 1 countr(ies) have their LE improved less than 5 years

5.2.5

# Count the countries whose LE went down; rows with a missing growth value
# are not counted.
n_decreased <- sum(life$growth < 0, na.rm = TRUE)
cat(n_decreased, "countr(ies) have their LE decreased over this time period")

## 0 countr(ies) have their LE decreased over this time period