Data frame primer

Basics on Building dataframes and tibbles

# Build two parallel vectors: one score per student, listed in the same order.
students <- c("Jerry", "Monica", "Felix", "James", "April", "Ruth", "Tony")

test_scores <- c(92, 75, 84, 94, 88, 89, 91)

# Bind the vectors column-wise into a data frame; the columns are named after
# the vectors. Then display the result.
grades <- data.frame(test_scores, students)
print(grades)

##   test_scores students
## 1          92    Jerry
## 2          75   Monica
## 3          84    Felix
## 4          94    James
## 5          88    April
## 6          89     Ruth
## 7          91     Tony

# class() reports the object's type; data.frame() produces a "data.frame".
class(grades)

## [1] "data.frame"

#create a matrix(by mistake)
# cbind() on two plain vectors returns a matrix, not a data frame. A matrix
# holds a single type, so the numeric scores are coerced to character strings.
gradesmatrix <-  cbind(test_scores, students)
class(gradesmatrix)

## [1] "matrix" "array"

#create a tibble
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Rebuild grades as a tibble; each column keeps the name of its source vector.
grades <- tibble(test_scores = test_scores, students = students)
print(grades)

## # A tibble: 7 x 2
##   test_scores students
##         <dbl> <chr>   
## 1          92 Jerry   
## 2          75 Monica  
## 3          84 Felix   
## 4          94 James   
## 5          88 April   
## 6          89 Ruth    
## 7          91 Tony

#update a value
# Overwrite the second test score (Monica's) with a missing value, then print.
grades$test_scores[2] <- NA
grades

## # A tibble: 7 x 2
##   test_scores students
##         <dbl> <chr>   
## 1          92 Jerry   
## 2          NA Monica  
## 3          84 Felix   
## 4          94 James   
## 5          88 April   
## 6          89 Ruth    
## 7          91 Tony

#query
# Names of the students scoring 90 or more. The comparison yields NA for the
# missing score, and indexing with NA returns NA, hence the NA in the result.
grades$students[grades$test_scores >= 90]

## [1] "Jerry" NA      "James" "Tony"

#query to show both students and grades >=90
# The row whose score is NA comes back as a row made entirely of NA values.
grades[grades$test_scores >=90, ] #return all columns

## # A tibble: 4 x 2
##   test_scores students
##         <dbl> <chr>   
## 1          92 Jerry   
## 2          NA <NA>    
## 3          94 James   
## 4          91 Tony

#query non NA values
# complete.cases() returns one logical per row: TRUE when the row has no NA.
complete.cases(grades)

## [1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE

# Use that logical vector as a row filter to keep only the complete rows.
grades[complete.cases(grades),]

## # A tibble: 6 x 2
##   test_scores students
##         <dbl> <chr>   
## 1          92 Jerry   
## 2          84 Felix   
## 3          94 James   
## 4          88 April   
## 5          89 Ruth    
## 6          91 Tony

# Permanently drop the incomplete row, then repeat the >= 90 query; with no
# NA scores left the result no longer contains an all-NA row.
grades <- grades[complete.cases(grades),]
grades[grades$test_scores >=90, ]

## # A tibble: 3 x 2
##   test_scores students
##         <dbl> <chr>   
## 1          92 Jerry   
## 2          94 James   
## 3          91 Tony

#sort in descending order
# order() must be given the columns of `grades` itself. The stand-alone
# test_scores/students vectors still hold all 7 original values, so ordering
# by them and indexing the 6-row `grades` scrambles the rows and adds an NA row.
grades[order(-grades$test_scores, grades$students), 1:2] #omitting 1:2 will default to showing all columns

## # A tibble: 6 x 2
##   test_scores students
##         <dbl> <chr>   
## 1          94 James   
## 2          92 Jerry   
## 3          91 Tony    
## 4          89 Ruth    
## 5          88 April   
## 6          84 Felix

# Logical vector flagging the rows of grades that contain no NA values.
cleangrades <- complete.cases(grades)

# sort in ascending order and specify the columns to show
# Order by the column stored in `grades`, not by the stand-alone test_scores
# vector, whose 7 values no longer line up with the 6 remaining rows.
grades[order(grades$test_scores), 1]

## # A tibble: 6 x 1
##   test_scores
##         <dbl>
## 1          84
## 2          88
## 3          89
## 4          91
## 5          92
## 6          94

#or
# dplyr equivalent of the base order() approach. desc() states the descending
# sort explicitly and, unlike unary minus, also works on non-numeric columns.
library(dplyr)
arrange(grades, desc(test_scores))

## # A tibble: 6 x 2
##   test_scores students
##         <dbl> <chr>   
## 1          94 James   
## 2          92 Jerry   
## 3          91 Tony    
## 4          89 Ruth    
## 5          88 April   
## 6          84 Felix

# Keep only the rows whose score is at least 85.
filter(grades, test_scores >= 85)

## # A tibble: 5 x 2
##   test_scores students
##         <dbl> <chr>   
## 1          92 Jerry   
## 2          94 James   
## 3          88 April   
## 4          89 Ruth    
## 5          91 Tony

# Keep only the test_scores column; the result is still a tibble.
grades %>% select(test_scores)

## # A tibble: 6 x 1
##   test_scores
##         <dbl>
## 1          92
## 2          84
## 3          94
## 4          88
## 5          89
## 6          91

# Chain both steps: keep the score column, then the rows scoring 85 or more.
grades %>%
  select(test_scores) %>%
  filter(test_scores >= 85)

## # A tibble: 5 x 1
##   test_scores
##         <dbl>
## 1          92
## 2          94
## 3          88
## 4          89
## 5          91

Importing data

library(readr)
options(scipen=999) # strongly prefer fixed over scientific notation ("sn") when printing numbers
gdp <- read_csv("gdp.csv") # reads gdp.csv, resolved relative to the current working directory

Accessing columns and rows

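Use square brackets, with a comma separating the row index from the column index: `data[rows, columns]`. Leaving one side of the comma empty selects every row (or every column).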

gdp[1,] # first row, every column (nothing after the comma means "all columns")

## # A tibble: 1 x 60
##   `Country Name` `Country Code` `1960` `1961` `1962` `1963` `1964` `1965` `1966`
##   <chr>          <chr>           <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 Aruba          ABW                NA     NA     NA     NA     NA     NA     NA
## # … with 51 more variables: 1967 <dbl>, 1968 <dbl>, 1969 <dbl>, 1970 <dbl>,
## #   1971 <dbl>, 1972 <dbl>, 1973 <dbl>, 1974 <dbl>, 1975 <dbl>, 1976 <dbl>,
## #   1977 <dbl>, 1978 <dbl>, 1979 <dbl>, 1980 <dbl>, 1981 <dbl>, 1982 <dbl>,
## #   1983 <dbl>, 1984 <dbl>, 1985 <dbl>, 1986 <dbl>, 1987 <dbl>, 1988 <dbl>,
## #   1989 <dbl>, 1990 <dbl>, 1991 <dbl>, 1992 <dbl>, 1993 <dbl>, 1994 <dbl>,
## #   1995 <dbl>, 1996 <dbl>, 1997 <dbl>, 1998 <dbl>, 1999 <dbl>, 2000 <dbl>,
## #   2001 <dbl>, 2002 <dbl>, 2003 <dbl>, 2004 <dbl>, 2005 <dbl>, 2006 <dbl>,
## #   2007 <dbl>, 2008 <dbl>, 2009 <dbl>, 2010 <dbl>, 2011 <dbl>, 2012 <dbl>,
## #   2013 <dbl>, 2014 <dbl>, 2015 <dbl>, 2016 <dbl>, 2017 <dbl>

gdp[,1] # every row of the first column (Country Name); the result is still a one-column tibble

## # A tibble: 264 x 1
##    `Country Name`      
##    <chr>               
##  1 Aruba               
##  2 Afghanistan         
##  3 Angola              
##  4 Albania             
##  5 Andorra             
##  6 Arab World          
##  7 United Arab Emirates
##  8 Argentina           
##  9 Armenia             
## 10 American Samoa      
## # … with 254 more rows

To do an unordered selection of columns and rows, use `c()`

# rows 1 through 6; columns 1, 2, and 57 through 60 inclusive
gdp[1:6, c(1:2, 57:60)]

## # A tibble: 6 x 6
##   `Country Name` `Country Code`   `2014`   `2015`   `2016`   `2017`
##   <chr>          <chr>             <dbl>    <dbl>    <dbl>    <dbl>
## 1 Aruba          ABW            NA       NA       NA       NA      
## 2 Afghanistan    AFG             2.06e10  1.92e10  1.95e10  2.08e10
## 3 Angola         AGO             1.27e11  1.03e11  9.53e10  1.24e11
## 4 Albania        ALB             1.32e10  1.14e10  1.19e10  1.30e10
## 5 Andorra        AND             3.35e 9  2.81e 9  2.88e 9  3.01e 9
## 6 Arab World     ARB             2.91e12  2.55e12  2.50e12  2.59e12

Adding a new column way 1

gdp$silly_column <- NA # create a new column; the single NA is recycled to every row
print(gdp$silly_column) # print the column that was just created (not a non-existent name)

##   [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [101] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [126] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [151] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [176] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [201] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [226] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [251] NA NA NA NA NA NA NA NA NA NA NA NA NA NA

Check to see that a new variable was added

dim(gdp) # rows, then columns: one more column than the 60 read from the file

## [1] 264  61

Adding a new column way 2

gdp["anothercolumn"] <- NA # assigning to a new name inside single brackets also adds a column
print(gdp$anothercolumn)

##   [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [101] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [126] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [151] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [176] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [201] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [226] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [251] NA NA NA NA NA NA NA NA NA NA NA NA NA NA

dim(gdp) # the column count has gone up by one again

## [1] 264  62

Reordering data frame columns

http://www.sthda.com/english/wiki/reordering-data-frame-columns-in-r

# Columns are reordered by listing their positions in the desired order.
# NOTE(review): c(1, 2:62) lists the columns in their existing order, so gdp2 is
# a copy of gdp with nothing moved. To really reorder, permute the index vector,
# e.g. gdp[, c(1, 2, 61, 62, 3:60)] puts the two new columns before the years.
gdp2 <- gdp[, c(1, 2:62)]
gdp2

## # A tibble: 264 x 62
##    `Country Name`   `Country Code`   `1960`   `1961`    `1962`   `1963`   `1964`
##    <chr>            <chr>             <dbl>    <dbl>     <dbl>    <dbl>    <dbl>
##  1 Aruba            ABW             NA       NA       NA       NA       NA      
##  2 Afghanistan      AFG              5.38e8   5.49e8   5.47e 8  7.51e 8  8.00e 8
##  3 Angola           AGO             NA       NA       NA       NA       NA      
##  4 Albania          ALB             NA       NA       NA       NA       NA      
##  5 Andorra          AND             NA       NA       NA       NA       NA      
##  6 Arab World       ARB             NA       NA       NA       NA       NA      
##  7 United Arab Emi… ARE             NA       NA       NA       NA       NA      
##  8 Argentina        ARG             NA       NA        2.45e10  1.83e10  2.56e10
##  9 Armenia          ARM             NA       NA       NA       NA       NA      
## 10 American Samoa   ASM             NA       NA       NA       NA       NA      
## # … with 254 more rows, and 55 more variables: 1965 <dbl>, 1966 <dbl>,
## #   1967 <dbl>, 1968 <dbl>, 1969 <dbl>, 1970 <dbl>, 1971 <dbl>, 1972 <dbl>,
## #   1973 <dbl>, 1974 <dbl>, 1975 <dbl>, 1976 <dbl>, 1977 <dbl>, 1978 <dbl>,
## #   1979 <dbl>, 1980 <dbl>, 1981 <dbl>, 1982 <dbl>, 1983 <dbl>, 1984 <dbl>,
## #   1985 <dbl>, 1986 <dbl>, 1987 <dbl>, 1988 <dbl>, 1989 <dbl>, 1990 <dbl>,
## #   1991 <dbl>, 1992 <dbl>, 1993 <dbl>, 1994 <dbl>, 1995 <dbl>, 1996 <dbl>,
## #   1997 <dbl>, 1998 <dbl>, 1999 <dbl>, 2000 <dbl>, 2001 <dbl>, 2002 <dbl>,
## #   2003 <dbl>, 2004 <dbl>, 2005 <dbl>, 2006 <dbl>, 2007 <dbl>, 2008 <dbl>,
## #   2009 <dbl>, 2010 <dbl>, 2011 <dbl>, 2012 <dbl>, 2013 <dbl>, 2014 <dbl>,
## #   2015 <dbl>, 2016 <dbl>, 2017 <dbl>, silly_column <lgl>, anothercolumn <lgl>

Removing a column

# Drop a column by name: a minus sign inside select() excludes that column.
library(dplyr)
gdp2 <- gdp2 %>% select(-anothercolumn)

# Check the dimensions after the removal.
dim(gdp2)

## [1] 264  61

Learning more about our data

str(gdp2) # compact overview: one line per column with its type and first few values

## tibble[,61] [264 × 61] (S3: tbl_df/tbl/data.frame)
##  $ Country Name: chr [1:264] "Aruba" "Afghanistan" "Angola" "Albania" ...
##  $ Country Code: chr [1:264] "ABW" "AFG" "AGO" "ALB" ...
##  $ 1960        : num [1:264] NA 537777811 NA NA NA ...
##  $ 1961        : num [1:264] NA 548888896 NA NA NA ...
##  $ 1962        : num [1:264] NA 546666678 NA NA NA ...
##  $ 1963        : num [1:264] NA 751111191 NA NA NA ...
##  $ 1964        : num [1:264] NA 800000044 NA NA NA ...
##  $ 1965        : num [1:264] NA 1006666638 NA NA NA ...
##  $ 1966        : num [1:264] NA 1399999967 NA NA NA ...
##  $ 1967        : num [1:264] NA 1673333418 NA NA NA ...
##  $ 1968        : num [1:264] NA 1373333367 NA NA NA ...
##  $ 1969        : num [1:264] NA 1408888922 NA NA NA ...
##  $ 1970        : num [1:264] NA 1748886596 NA NA 78619206 ...
##  $ 1971        : num [1:264] NA 1831108971 NA NA 89409820 ...
##  $ 1972        : num [1:264] NA 1595555476 NA NA 113408232 ...
##  $ 1973        : num [1:264] NA 1733333264 NA NA 150820103 ...
##  $ 1974        : num [1:264] NA 2155555498 NA NA 186558696 ...
##  $ 1975        : num [1:264] NA 2366666616 NA NA 220127246 ...
##  $ 1976        : num [1:264] NA 2555555567 NA NA 227281025 ...
##  $ 1977        : num [1:264] NA 2953333418 NA NA 254020153 ...
##  $ 1978        : num [1:264] NA 3300000109 NA NA 308008898 ...
##  $ 1979        : num [1:264] NA 3697940410 NA NA 411578334 ...
##  $ 1980        : num [1:264] NA 3641723322 5930503401 NA 446416106 ...
##  $ 1981        : num [1:264] NA 3478787909 5550483036 NA 388958731 ...
##  $ 1982        : num [1:264] NA NA 5550483036 NA 375895956 ...
##  $ 1983        : num [1:264] NA NA 5784341596 NA 327861833 ...
##  $ 1984        : num [1:264] NA NA 6131475065 1924242453 330070689 ...
##  $ 1985        : num [1:264] NA NA 7553560459 1965384586 346737965 ...
##  $ 1986        : num [1:264] NA NA 7072063345 2173750012 482000594 ...
##  $ 1987        : num [1:264] NA NA 8083872012 2156624900 611316399 ...
##  $ 1988        : num [1:264] NA NA 8769250550 2126000000 721425939 ...
##  $ 1989        : num [1:264] NA NA 10201099040 2335124988 795449332 ...
##  $ 1990        : num [1:264] NA NA 11228764963 2101624962 1029048482 ...
##  $ 1991        : num [1:264] NA NA 10603784541 1139166646 1106928583 ...
##  $ 1992        : num [1:264] NA NA 8307810974 709452584 1210013652 ...
##  $ 1993        : num [1:264] NA NA 5768720422 1228071038 1007025755 ...
##  $ 1994        : num [1:264] 1330167598 NA 4438321017 1985673798 1017549124 ...
##  $ 1995        : num [1:264] 1320670391 NA 5538749260 2424499009 1178738991 ...
##  $ 1996        : num [1:264] 1379888268 NA 7526446606 3314898292 1223945357 ...
##  $ 1997        : num [1:264] 1531843575 NA 7648377413 2359903108 1180597273 ...
##  $ 1998        : num [1:264] 1665363128 NA 6506229607 2707123772 1211932398 ...
##  $ 1999        : num [1:264] 1722798883 NA 6152922943 3414760915 1239876305 ...
##  $ 2000        : num [1:264] 1873452514 NA 9129594819 3632043908 1434429703 ...
##  $ 2001        : num [1:264] 1920262570 2461665938 8936063723 4060758804 1496912752 ...
##  $ 2002        : num [1:264] 1941094972 4128820723 12497347956 4435078648 1733116883 ...
##  $ 2003        : num [1:264] 2021301676 4583644246 14188949398 5746945913 2398645598 ...
##  $ 2004        : num [1:264] 2228279330 5285465686 19640853734 7314865176 2935659300 ...
##  $ 2005        : num [1:264] 2331005587 6275073572 28233712738 8158548717 3255789081 ...
##  $ 2006        : num [1:264] 2421474860 7057598407 41789479932 8992642349 3543256806 ...
##  $ 2007        : num [1:264] 2623726257 9843842455 60448924662 10701011897 4016972351 ...
##  $ 2008        : num [1:264] 2791960894 10190529882 84178035579 12881352688 4007353157 ...
##  $ 2009        : num [1:264] 2498932961 12486943506 75492385928 12044212904 3660530703 ...
##  $ 2010        : num [1:264] 2467703911 15936800636 82526143645 11926953259 3355695364 ...
##  $ 2011        : num [1:264] 2584463687 17930239400 104115807986 12890867539 3442062830 ...
##  $ 2012        : num [1:264] NA 20536542737 113923162050 12319784787 3164615187 ...
##  $ 2013        : num [1:264] NA 20264253974 124912503781 12776277515 3281585236 ...
##  $ 2014        : num [1:264] NA 20616104298 126730196125 13228244357 3350736367 ...
##  $ 2015        : num [1:264] NA 19215562179 102621215573 11386931490 2811489409 ...
##  $ 2016        : num [1:264] NA 19469022208 95337203468 11883682171 2877311947 ...
##  $ 2017        : num [1:264] NA 20815300220 124209385825 13039352744 3012914131 ...
##  $ silly_column: logi [1:264] NA NA NA NA NA NA ...

Summary

summary(gdp2) # per-column summaries (quartiles, mean and NA counts for the numeric columns)

Converting to log scale

This is the basic logarithm function, called with 9 as the value and 3 as the base. The result is 2 because 9 is 3 squared (3^2 = 9).

# log() in R - core syntax: log(x, base)
log(9, base = 3)

## [1] 2

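Here the second argument is omitted, so the base defaults to e and the call returns the natural logarithm of 5. Passing an approximation of e (2.718) as the base gives almost, but not exactly, the same answer.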

log(5) # no base given, so the natural logarithm (base e) is used

## [1] 1.609438

log(5, base = 2.718) # 2.718 only approximates e, so this differs from log(5) in the 4th decimal

## [1] 1.609605

Inverse

exp(1.609438) # exp() undoes the natural log: this returns (approximately) 5

## [1] 5

Log transformation

A log transformation applies a logarithm to data in order to reduce its skew. This is usually done when the values are highly skewed, so that the data are easier to read and compare. In R, a log transformation is done by applying the log() function (or log10(), log1p(), etc.) to a vector, a data frame column, or another numeric data set. Before the logarithm is applied, 1 is added to each data value (not to the base of the logarithm) so that a value of 0 does not produce log(0), which is -Inf. The transformed data are less skewed than the original, making them easier to understand.

# Sample values spanning several orders of magnitude.
myvector <- c(100, 10, 5, 2, 1, 0.5, 0.1, 0.05, 0.01, 0.001, 0.0001)
# Shift every value by 1, then take the base-10 logarithm.
transformedvector <- log10(myvector + 1)

# Raw values first ...
plot(myvector)

# ... then the transformed values, for comparison.
plot(transformedvector)

data frame

Log transforming a data frame in R takes one extra step, because you first have to pick out the column you want. Taking the log of the entire data frame returns the log of every value in it (and fails if any column is not numeric). Usually you only need the log of a single column, which you can store as a new column.

 ChickWeight$logweight <- log(ChickWeight$weight)
# The line above adds the natural log of weight as a new logweight column.
# Uncomment to inspect the first rows:
#head(ChickWeight)

# First six observations on the log scale ...
plot(head(ChickWeight$Time), head(ChickWeight$logweight))

# ... and on the original scale, for comparison.
plot(head(ChickWeight$Time), head(ChickWeight$weight))

Visualize it

boxplot(gdp$`2017`[1:10]) # distribution of the 2017 values for the first 10 rows only

#boxplot(gdp$`2017`) # uncomment to plot every row instead

Let's apply this to a column in GDP

# Columns 3 through 60 are the yearly columns (1960-2017); replace each value
# in place with log(value + 1). Adding 1 keeps a value of 0 from becoming -Inf.
gdp[,3:60] <- log(gdp[,3:60] + 1)
#(or use log1p(x) computes log(1+x) accurately)

Learn more about using the natural logarithm with economic data: https://econbrowser.com/archives/2014/02/use-of-logarithms-in-economics

Subsetting

# Re-read the file so gdp holds the original values again.
gdp <- read_csv("gdp.csv")

# Base R: keep the rows whose country code is ARE.
uae <- subset(gdp, `Country Code` == "ARE")

# dplyr: the same row selection with filter().
library(dplyr)
uae2 <- gdp %>% filter(`Country Code` == "ARE")
print(uae2)

## # A tibble: 1 x 60
##   `Country Name` `Country Code` `1960` `1961` `1962` `1963` `1964` `1965` `1966`
##   <chr>          <chr>           <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 United Arab E… ARE                NA     NA     NA     NA     NA     NA     NA
## # … with 51 more variables: 1967 <dbl>, 1968 <dbl>, 1969 <dbl>, 1970 <dbl>,
## #   1971 <dbl>, 1972 <dbl>, 1973 <dbl>, 1974 <dbl>, 1975 <dbl>, 1976 <dbl>,
## #   1977 <dbl>, 1978 <dbl>, 1979 <dbl>, 1980 <dbl>, 1981 <dbl>, 1982 <dbl>,
## #   1983 <dbl>, 1984 <dbl>, 1985 <dbl>, 1986 <dbl>, 1987 <dbl>, 1988 <dbl>,
## #   1989 <dbl>, 1990 <dbl>, 1991 <dbl>, 1992 <dbl>, 1993 <dbl>, 1994 <dbl>,
## #   1995 <dbl>, 1996 <dbl>, 1997 <dbl>, 1998 <dbl>, 1999 <dbl>, 2000 <dbl>,
## #   2001 <dbl>, 2002 <dbl>, 2003 <dbl>, 2004 <dbl>, 2005 <dbl>, 2006 <dbl>,
## #   2007 <dbl>, 2008 <dbl>, 2009 <dbl>, 2010 <dbl>, 2011 <dbl>, 2012 <dbl>,
## #   2013 <dbl>, 2014 <dbl>, 2015 <dbl>, 2016 <dbl>, 2017 <dbl>

Single column - Multiple values

# Base R: %in% matches one column against several values at once.
gdp_multiple <- subset(gdp, `Country Code` %in% c("ARE", "CHN", "GBR"))

# dplyr: the same selection written as three conditions joined with "or".
gdp_multiple2 <- filter(
  gdp,
  `Country Code` == "ARE" |
    `Country Code` == "CHN" |
    `Country Code` == "GBR"
)

Transforming using the `gather` function.

library(dplyr)
library(tidyverse)
# Reshape from wide to long: columns 3 through 60 (one per year) collapse into
# a "year" column holding the old column name and a "gdp" column holding the
# value. (tidyr documents gather() as superseded by pivot_longer().)
gdp_multiple2_melt <- gather(gdp_multiple2, key = "year", value = "gdp", 3:60)

# The year values come from column names, so they are text; make them numeric.
gdp_multiple2_melt$year <- as.numeric(gdp_multiple2_melt$year)

library(ggthemes)
library(ggplot2)
library (lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Prefer fixed over scientific notation when large numbers are formatted.
options(scipen = 999)

# One line per country. The reshaped year columns run from 1960 to 2017, so
# the title states that range.
ggplot(
  gdp_multiple2_melt,
  aes(x = year, y = gdp, colour = `Country Name`, group = `Country Name`)
) +
  geom_line() +
  labs(
    title = "GDP from 1960 through 2017",
    caption = "Source: World Bank (2021)",
    x = "Year",
    y = "GDP"
  ) +
  theme_economist()

## Warning: Removed 15 row(s) containing missing values (geom_path).

library(ggthemes)
library(ggplot2)
library (lubridate)
# Prefer fixed over scientific notation when large numbers are formatted.
options(scipen = 999)

# Same chart on a log scale: y is log1p(gdp), i.e. log(1 + GDP), so the axis
# label and title say so. The year columns run from 1960 to 2017.
ggplot(
  gdp_multiple2_melt,
  aes(x = year, y = log1p(gdp), colour = `Country Name`, group = `Country Name`)
) +
  geom_line() +
  labs(
    title = "GDP from 1960 through 2017 (log scale)",
    caption = "Source: World Bank (2021)",
    x = "Year",
    y = "log(1 + GDP)"
  ) +
  theme_economist()

## Warning: Removed 15 row(s) containing missing values (geom_path).

Data frame primer

Kristen Sosulski

4/7/2022

Basics on Building dataframes and tibbles

Importing data

Accessing columns and rows

To do an unordered selection of columns and rows, use `c()`

Adding a new column way 1

Adding a new column way 2

Reordering data frame columns

Removing a column

Learning more about our data

Summary

Converting to log scale

Inverse

Log transformation

data frame

Visualize it

Let's apply this to a column in GDP

Subsetting

Single column - Multiple values

Transforming using the `gather` function.

Data frame primer

Kristen Sosulski

4/7/2022

Basics on Building dataframes and tibbles

Importing data

Accessing columns and rows

To do an unordered selection of columns and rows, use c()

Adding a new column way 1

Adding a new column way 2

Reordering data frame columns

Removing a column

Learning more about our data

Summary

Converting to log scale

Inverse

Log transformation

data frame

Visualize it

Let's apply this to a column in GDP

Subsetting

Single column - Multiple values

Transforming using the gather function.

To do an unordered selection of columns and rows, use `c()`

Transforming using the `gather` function.