nordic data: https://raw.githubusercontent.com/datacarpentry/r-intro-geospatial/master/_episodes_rmd/data/nordic-data.csv

nordic data 2: https://raw.githubusercontent.com/datacarpentry/r-intro-geospatial/master/_episodes_rmd/data/nordic-data-2.csv

gapminder data: https://raw.githubusercontent.com/datacarpentry/r-intro-geospatial/master/_episodes_rmd/data/gapminder_data.csv

Part 1: Data Structures

### Let's go get some data and explore it using what we've learned so far.
## Lesson page: https://datacarpentry.org/r-intro-geospatial/02-project-intro/index.html
### Copy this link into the Zoom chat so participants can copy it.
# Read the small Nordic CSV straight from GitHub into a data frame.
nordic <- 
  read.csv("https://raw.githubusercontent.com/datacarpentry/r-intro-geospatial/master/_episodes_rmd/data/nordic-data.csv") 
# Alternative that reads a local copy instead (path as originally noted):
#nordic <- read.csv('data/nordic')

# Optional: open the data frame in the data viewer.
#View(nordic)
# `$` pulls a single column out of the data frame as a vector.
nordic$country
## [1] "Denmark" "Sweden"  "Norway"
nordic$lifeExp
## [1] 77.2 80.0 79.0
# Arithmetic on a numeric column is applied to every element.
nordic$lifeExp + 2
## [1] 79.2 82.0 81.0
# Left commented out on purpose: adding a numeric column to a character
# column is an error in R.
#nordic$lifeExp + nordic$country

# class() reports the type of a column.
class(nordic$lifeExp)
## [1] "numeric"
# Exercise: what kind of objects are the other columns?
class(nordic$year)
## [1] "integer"
class(nordic$country)
## [1] "character"
# There are 6 main types: numeric, integer, complex, logical, character,
# and factor. class() reports which one a value has.
class(1)
## [1] "numeric"
class(1L)  ## the L suffix forces integer type
## [1] "integer"
class(TRUE)  ## spell out TRUE: T is an ordinary variable that can be reassigned
## [1] "logical"
class("banana")  ## remember the quotes
## [1] "character"
# Data type rules can be a useful check to make sure data is all of one type.
# They can also be a pain.
# Alternative that reads a local copy instead (path as originally noted):
#nordic2 <- read.csv('data/nordic')
# Read the second version of the Nordic CSV for comparison.
nordic_2 <- 
  read.csv("https://raw.githubusercontent.com/datacarpentry/r-intro-geospatial/master/_episodes_rmd/data/nordic-data-2.csv") 
# Here lifeExp comes back as character, while the same column in `nordic`
# is numeric.
# NOTE(review): presumably at least one lifeExp entry in this file is not a
# plain number -- inspect the data to confirm.
class(nordic_2$lifeExp)
## [1] "character"
class(nordic$lifeExp)
## [1] "numeric"
# How can we look at nordic_2?
#View(nordic_2)

# The object as a whole is a data frame; str() summarises every column.
class(nordic)
## [1] "data.frame"
str(nordic)
## 'data.frame':    3 obs. of  3 variables:
##  $ country: chr  "Denmark" "Sweden" "Norway"
##  $ year   : int  2002 2002 2002
##  $ lifeExp: num  77.2 80 79
# Explore different types of data structures.
## First up: vectors.

# vector() defaults to mode "logical", so this is three FALSE values.
my_vector <- vector(length = 3)
my_vector  ## elements in a vector must all be the same type
## [1] FALSE FALSE FALSE
another_vector <- vector(mode = "character", length = 3)
another_vector
## [1] "" "" ""
str(another_vector)  # character vector
##  chr [1:3] "" "" ""
# The combine function c() builds a vector from its arguments.
combine_vector <- c(2, 6, 3)
str(combine_vector)
##  num [1:3] 2 6 3
# Exercise: what kind of vector will this be?
quiz_vector <- c(2, 6, "3")
str(quiz_vector)
##  chr [1:3] "2" "6" "3"
# Left commented out on purpose: arithmetic on a character vector is an error.
#quiz_vector + 2
# R stops us from doing something bad there, but sometimes types make R do
# bad stuff silently.
# Example
coercion_vector <- c("a", TRUE)
coercion_vector  ### the logical item was converted to character
## [1] "a"    "TRUE"
another_coercion_vector <- c(0, TRUE)
another_coercion_vector  # the logical was converted to numeric
## [1] 0 1
# Coercion order of operations:
# logical -> integer -> numeric -> complex -> character

# R lets us coerce elements explicitly.
character_vector_example <- c("0", "2", "4")
str(character_vector_example)
##  chr [1:3] "0" "2" "4"
character_coerced_to_numeric <- as.numeric(character_vector_example)
str(character_coerced_to_numeric)
##  num [1:3] 0 2 4
# Zero becomes FALSE; the other numbers here become TRUE.
numeric_coerced_to_logical <- as.logical(character_coerced_to_numeric)
str(numeric_coerced_to_logical)
##  logi [1:3] FALSE TRUE TRUE
# Use combine to join things.
ab_vector <- c("a", "b")
ab_vector
## [1] "a" "b"
combine_example <- c(ab_vector, "DC")
combine_example
## [1] "a"  "b"  "DC"
# Shortcut to make a vector of numbers. seq_len(n) is used rather than
# seq(n) because it yields an empty vector for n = 0 instead of counting
# down from 1 to 0.
sequence_example <- seq_len(10)
head(sequence_example, n = 2)
## [1] 1 2
tail(sequence_example, n = 4)
## [1]  7  8  9 10
length(sequence_example)
## [1] 10
str(sequence_example)
##  int [1:10] 1 2 3 4 5 6 7 8 9 10
## Elements in a vector can be named (names may repeat).
names(sequence_example) <- c("a", "b", "c", "d", "e", rep("x", 5))
sequence_example
##  a  b  c  d  e  x  x  x  x  x 
##  1  2  3  4  5  6  7  8  9 10
# Factors
# Factors behave much like the other types but carry categorical properties.
str(nordic$country)
##  chr [1:3] "Denmark" "Sweden" "Norway"
# Let's explore the factor data structure.
nordic_countries <- c("Norway", "Finland", "Denmark", "Iceland", "Sweden")
nordic_countries
## [1] "Norway"  "Finland" "Denmark" "Iceland" "Sweden"
# What is the structure type?
categories <- factor(nordic_countries)
str(categories)  ## levels are sorted alphabetically by default
##  Factor w/ 5 levels "Denmark","Finland",..: 4 2 1 3 5
# Supplying the levels explicitly keeps them in the order written; here that
# order is exactly the order of the vector itself.
categories <- factor(nordic_countries, levels = nordic_countries)
str(categories)
##  Factor w/ 5 levels "Norway","Finland",..: 1 2 3 4 5
# Lists
# Unlike vectors, the elements of a list don't have to be the same type.
list_example <- list(
  1,
  "a",
  TRUE,
  c(2, 6, 7)
)
str(list_example)
## List of 4
##  $ : num 1
##  $ : chr "a"
##  $ : logi TRUE
##  $ : num [1:3] 2 6 7
# Elements can be given names when the list is built.
another_list <- list(
  title = "Numbers",
  numbers = seq_len(10),
  data = TRUE
)
str(another_list)
## List of 3
##  $ title  : chr "Numbers"
##  $ numbers: int [1:10] 1 2 3 4 5 6 7 8 9 10
##  $ data   : logi TRUE
# Data frames are lists, but each element must be the same length.

# Different ways to access elements in a list or data.frame.
# Printing the whole data frame first for reference.
nordic
##   country year lifeExp
## 1 Denmark 2002    77.2
## 2  Sweden 2002    80.0
## 3  Norway 2002    79.0
# `$name` returns the column as a vector.
nordic$country
## [1] "Denmark" "Sweden"  "Norway"
# [rows, columns] with the rows left empty: column 1 as a vector.
nordic[,1]
## [1] "Denmark" "Sweden"  "Norway"
# Single brackets with one index keep the data frame: a one-column data frame.
nordic[1]
##   country
## 1 Denmark
## 2  Sweden
## 3  Norway
# Double brackets extract the column itself as a vector.
nordic[[1]]
## [1] "Denmark" "Sweden"  "Norway"
# The same two forms work with a column name instead of a position.
nordic[,'country']
## [1] "Denmark" "Sweden"  "Norway"
nordic['country']
##   country
## 1 Denmark
## 2  Sweden
## 3  Norway
# Row 1, column 1: a single value.
nordic[1,1]
## [1] "Denmark"
# Take row 1 as a one-row data frame, then pull its country column.
nordic[1,]$country
## [1] "Denmark"
# Key points:
# - read data with read.csv, read.table or read.delim (I also use read_xlsx)
# - pay attention to data types

Part 2: Exploring Data Frames in R

https://datacarpentry.org/r-intro-geospatial/04-data-structures-part2/index.html

# Alternative that reads a local copy instead (path as originally noted):
#gapminder <- read.csv('data/gapminder.csv')
# Read the gapminder CSV straight from GitHub into a data frame.
gapminder <- read.csv("https://raw.githubusercontent.com/datacarpentry/r-intro-geospatial/master/_episodes_rmd/data/gapminder_data.csv") 

# Ways to look at data.
str(gapminder)  ## we learned this already
## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
# Number of rows, number of columns, and both at once.
nrow(gapminder)
## [1] 1704
ncol(gapminder)
## [1] 6
dim(gapminder)  
## [1] 1704    6
colnames(gapminder)
## [1] "country"   "year"      "pop"       "continent" "lifeExp"   "gdpPercap"
# head() shows the first six rows.
head(gapminder)
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134
### Try tail(). What does this give you?
## Answer in the Zoom chat.
tail(gapminder)
##       country year      pop continent lifeExp gdpPercap
## 1699 Zimbabwe 1982  7636524    Africa  60.363  788.8550
## 1700 Zimbabwe 1987  9216418    Africa  62.351  706.1573
## 1701 Zimbabwe 1992 10704340    Africa  60.377  693.4208
## 1702 Zimbabwe 1997 11404948    Africa  46.809  792.4500
## 1703 Zimbabwe 2002 11926563    Africa  39.989  672.0386
## 1704 Zimbabwe 2007 12311143    Africa  43.487  469.7093
#View(gapminder)


# Challenge: show me a command that gives lines in the middle of gapminder.
# Answer 1: keep the first 500 rows, then show the last six of those.
tail(head(gapminder, 500))
##     country year     pop continent lifeExp gdpPercap
## 495 Eritrea 1962 1666618    Africa  40.158  380.9958
## 496 Eritrea 1967 1820319    Africa  42.189  468.7950
## 497 Eritrea 1972 2260187    Africa  44.142  514.3242
## 498 Eritrea 1977 2512642    Africa  44.535  505.7538
## 499 Eritrea 1982 2637297    Africa  43.890  524.8758
## 500 Eritrea 1987 2915959    Africa  46.453  521.1341
# Answer 2: index the same rows directly.
gapminder[495:500,]
##     country year     pop continent lifeExp gdpPercap
## 495 Eritrea 1962 1666618    Africa  40.158  380.9958
## 496 Eritrea 1967 1820319    Africa  42.189  468.7950
## 497 Eritrea 1972 2260187    Africa  44.142  514.3242
## 498 Eritrea 1977 2512642    Africa  44.535  505.7538
## 499 Eritrea 1982 2637297    Africa  43.890  524.8758
## 500 Eritrea 1987 2915959    Africa  46.453  521.1341
# More tips for checking out data.
# table() counts how many rows fall in each category.
table(gapminder$continent)
## 
##   Africa Americas     Asia   Europe  Oceania 
##      624      300      396      360       24
# summary() gives a per-column overview.
summary(gapminder)
##    country               year           pop             continent        
##  Length:1704        Min.   :1952   Min.   :6.001e+04   Length:1704       
##  Class :character   1st Qu.:1966   1st Qu.:2.794e+06   Class :character  
##  Mode  :character   Median :1980   Median :7.024e+06   Mode  :character  
##                     Mean   :1980   Mean   :2.960e+07                     
##                     3rd Qu.:1993   3rd Qu.:1.959e+07                     
##                     Max.   :2007   Max.   :1.319e+09                     
##     lifeExp        gdpPercap       
##  Min.   :23.60   Min.   :   241.2  
##  1st Qu.:48.20   1st Qu.:  1202.1  
##  Median :60.71   Median :  3531.8  
##  Mean   :59.47   Mean   :  7215.3  
##  3rd Qu.:70.85   3rd Qu.:  9325.5  
##  Max.   :82.60   Max.   :113523.1
## Adding columns and rows to data frames.

# We would like to create a new column to hold information on whether the
# life expectancy is below the world average life expectancy (70.5) or above:

below_average <- gapminder$lifeExp < 70.5  ## do some basic evaluation
str(below_average) # output is a vector of logical values
##  logi [1:1704] TRUE TRUE TRUE TRUE TRUE TRUE ...
nrow(gapminder)  ## does the number of rows match? Why is this important?
## [1] 1704
table(below_average)  
## below_average
## FALSE  TRUE 
##   461  1243
# cbind() returns a new data frame with the vector attached as an extra
# column; the result is only printed here, gapminder itself is not changed.
#cbind(gapminder, below_average)
head(cbind(gapminder, below_average))
##       country year      pop continent lifeExp gdpPercap below_average
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453          TRUE
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530          TRUE
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007          TRUE
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971          TRUE
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811          TRUE
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134          TRUE
head(cbind(gapminder, gapminder$lifeExp < 70.5)) ## same values, but the column is named after the expression
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134
##   gapminder$lifeExp < 70.5
## 1                     TRUE
## 2                     TRUE
## 3                     TRUE
## 4                     TRUE
## 5                     TRUE
## 6                     TRUE
gapminder2 <- cbind(gapminder, below_average)  ## make new data with the appended column
head(gapminder2)
##       country year      pop continent lifeExp gdpPercap below_average
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453          TRUE
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530          TRUE
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007          TRUE
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971          TRUE
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811          TRUE
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134          TRUE
# Optional: overwrite the original with the extended copy.
#gapminder <- gapminder2

### With data frames, this may be easier: assign the new column directly.
### This modifies gapminder in place.
gapminder$below_average <- gapminder$lifeExp < 70.5
head(gapminder)
##       country year      pop continent lifeExp gdpPercap below_average
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453          TRUE
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530          TRUE
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007          TRUE
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971          TRUE
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811          TRUE
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134          TRUE
## Add some rows.
## This is trickier because a row holds one value per column, and the
## columns have different types.
str(gapminder[1, ])
## 'data.frame':    1 obs. of  7 variables:
##  $ country      : chr "Afghanistan"
##  $ year         : int 1952
##  $ pop          : num 8425333
##  $ continent    : chr "Asia"
##  $ lifeExp      : num 28.8
##  $ gdpPercap    : num 779
##  $ below_average: logi TRUE
# A list (not c()) lets every value keep its own type. The year is written
# as the integer literal 2016L so that `year` stays an integer column after
# rbind(); a plain 2016 is a double and would turn the whole column numeric.
new_row <- list("Norway", 2016L, 5000000, "Nordic", 80.3, 49400.0, FALSE)

# Challenge: write a line of code that checks if the dimensions of new_row
# will work with the gapminder data frame.
ncol(gapminder) == length(new_row)
## [1] TRUE
gapminder_norway <- rbind(gapminder, new_row)
tail(gapminder_norway)
##       country year      pop continent lifeExp  gdpPercap below_average
## 1700 Zimbabwe 1987  9216418    Africa  62.351   706.1573          TRUE
## 1701 Zimbabwe 1992 10704340    Africa  60.377   693.4208          TRUE
## 1702 Zimbabwe 1997 11404948    Africa  46.809   792.4500          TRUE
## 1703 Zimbabwe 2002 11926563    Africa  39.989   672.0386          TRUE
## 1704 Zimbabwe 2007 12311143    Africa  43.487   469.7093          TRUE
## 1705   Norway 2016  5000000    Nordic  80.300 49400.0000         FALSE
# Factors
str(gapminder$continent)
##  chr [1:1704] "Asia" "Asia" "Asia" "Asia" "Asia" "Asia" "Asia" "Asia" ...
# Change a column to a factor.
gapminder$continent <- factor(gapminder$continent)
# Left commented out on purpose: "Nordic" is not one of the factor's levels
# yet, and adding a value that is not an existing level gives NA with an
# "invalid factor level" warning.
#gapminder_norway <- rbind(gapminder, new_row)
tail(gapminder_norway)  ## still the data frame built before the conversion
##       country year      pop continent lifeExp  gdpPercap below_average
## 1700 Zimbabwe 1987  9216418    Africa  62.351   706.1573          TRUE
## 1701 Zimbabwe 1992 10704340    Africa  60.377   693.4208          TRUE
## 1702 Zimbabwe 1997 11404948    Africa  46.809   792.4500          TRUE
## 1703 Zimbabwe 2002 11926563    Africa  39.989   672.0386          TRUE
## 1704 Zimbabwe 2007 12311143    Africa  43.487   469.7093          TRUE
## 1705   Norway 2016  5000000    Nordic  80.300 49400.0000         FALSE
# Register "Nordic" as a level first; then the row can be appended.
levels(gapminder$continent) <- c(levels(gapminder$continent), "Nordic")
gapminder_norway <- rbind(gapminder, new_row)
tail(gapminder_norway)
##       country year      pop continent lifeExp  gdpPercap below_average
## 1700 Zimbabwe 1987  9216418    Africa  62.351   706.1573          TRUE
## 1701 Zimbabwe 1992 10704340    Africa  60.377   693.4208          TRUE
## 1702 Zimbabwe 1997 11404948    Africa  46.809   792.4500          TRUE
## 1703 Zimbabwe 2002 11926563    Africa  39.989   672.0386          TRUE
## 1704 Zimbabwe 2007 12311143    Africa  43.487   469.7093          TRUE
## 1705   Norway 2016  5000000    Nordic  80.300 49400.0000         FALSE
dim(gapminder)
## [1] 1704    7
dim(rbind(gapminder, gapminder))  # paste data frames together row-wise
## [1] 3408    7
dim(cbind(gapminder, gapminder))  # paste data frames together column-wise
## [1] 1704   14
# Data frames do not have to be imported from a CSV; they can be built in R.
# Each argument becomes a column, and all columns must have the same length.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
data_frame <- data.frame(
  title = c("a", "b", "c"),
  numbers = 1:3,
  data = c(TRUE, TRUE, FALSE)
)
str(data_frame)
## 'data.frame':    3 obs. of  3 variables:
##  $ title  : chr  "a" "b" "c"
##  $ numbers: int  1 2 3
##  $ data   : logi  TRUE TRUE FALSE