# 2 - importing-data-from-statistical-software-packages
# Import data from three software packages: SAS, STATA and SPSS.
#
# SAS: read_sas()
# STATA: read_dta() (or read_stata(); the two are identical)
# SPSS: read_sav() or read_por(), depending on the file type.
##############################################
# cat("\014") # clear screen before next section
##############################################
# Import sales.sas7bdat: sales
# install.packages("haven")
library(haven)
# Location of the SAS dataset.
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/sales.sas7bdat"
# read_sas() is given the URL directly; the result is stored in `sales`.
sales <- read_sas(url)
# Display the structure of sales
str(sales)
## Classes 'tbl_df', 'tbl' and 'data.frame': 431 obs. of 4 variables:
## $ purchase: num 0 0 1 1 0 0 0 0 0 0 ...
## $ age : num 41 47 41 39 32 32 33 45 43 40 ...
## $ gender : chr "Female" "Female" "Female" "Female" ...
## $ income : chr "Low" "Low" "Low" "Low" ...
## - attr(*, "label")= chr "SALES"
##############################################
# cat("\014") # clear screen before next section
##############################################
# Import the data from the URL: sugar
# Data on yearly import and export numbers of sugar, both in USD and in weight.
url <- "http://assets.datacamp.com/production/course_1478/datasets/trade.dta"
# read_dta() is given the URL directly; the result is stored in `sugar`.
sugar <- read_dta(url)
# Structure of sugar
# The Date column has class labelled.
str(sugar)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10 obs. of 5 variables:
## $ Date : 'labelled' num 10 9 8 7 6 5 4 3 2 1
## ..- attr(*, "label")= chr "Date"
## ..- attr(*, "format.stata")= chr "%9.0g"
## ..- attr(*, "labels")= Named num 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "2004-12-31" "2005-12-31" "2006-12-31" "2007-12-31" ...
## $ Import : num 37664782 16316512 11082246 35677943 9879878 ...
## ..- attr(*, "label")= chr "Import"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Weight_I: num 54029106 21584365 14526089 55034932 14806865 ...
## ..- attr(*, "label")= chr "Weight_I"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Export : num 5.45e+07 1.03e+08 3.79e+07 4.85e+07 7.15e+07 ...
## ..- attr(*, "label")= chr "Export"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Weight_E: num 9.34e+07 1.58e+08 8.80e+07 1.12e+08 1.32e+08 ...
## ..- attr(*, "label")= chr "Weight_E"
## ..- attr(*, "format.stata")= chr "%9.0g"
## - attr(*, "label")= chr "Written by R."
# Turn the labelled Date column into real dates: as_factor() swaps each code
# for its value label (labels such as "2004-12-31"), and as.Date() then
# converts the result to class Date.
sugar[["Date"]] <- as.Date(as_factor(sugar[["Date"]]))
# Inspect the structure once more; Date should now be of class Date
str(object = sugar)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10 obs. of 5 variables:
## $ Date : Date, format: "2013-12-31" "2012-12-31" ...
## $ Import : num 37664782 16316512 11082246 35677943 9879878 ...
## ..- attr(*, "label")= chr "Import"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Weight_I: num 54029106 21584365 14526089 55034932 14806865 ...
## ..- attr(*, "label")= chr "Weight_I"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Export : num 5.45e+07 1.03e+08 3.79e+07 4.85e+07 7.15e+07 ...
## ..- attr(*, "label")= chr "Export"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Weight_E: num 9.34e+07 1.58e+08 8.80e+07 1.12e+08 1.32e+08 ...
## ..- attr(*, "label")= chr "Weight_E"
## ..- attr(*, "format.stata")= chr "%9.0g"
## - attr(*, "label")= chr "Written by R."
# Scatter plot of Import (x-axis) against Weight_I (y-axis).
# The correlation is positive: the more sugar is traded,
# the higher the weight that is traded.
plot(x = sugar$Import, y = sugar$Weight_I)
##############################################
# cat("\014") # clear screen before next section
##############################################
# Import SPSS data with haven
library(haven)
# haven offers two SPSS readers:
#   read_sav() - for .sav files
#   read_por() - for .por files
# The data cover four of the Big Five personality traits for 434 persons
# (Source: University of Bath).
# http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html
# http://staff.bath.ac.uk/pssiw/stats2/personality.sav
# The Big Five is a psychological concept that originally comprised
# five dimensions of personality used to classify human personality.
# The SPSS dataset is called person.sav
# http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/person.sav
# Import person.sav: traits
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/person.sav"
traits <- read_sav(file = url)
# Summarize traits
summary(object = traits) # also shows how many NAs each variable contains
## Neurotic Extroversion Agreeableness Conscientiousness
## Min. : 0.00 Min. : 5.00 Min. :15.00 Min. : 7.00
## 1st Qu.:18.00 1st Qu.:26.00 1st Qu.:39.00 1st Qu.:25.00
## Median :24.00 Median :31.00 Median :45.00 Median :30.00
## Mean :23.63 Mean :30.23 Mean :44.55 Mean :30.85
## 3rd Qu.:29.00 3rd Qu.:34.00 3rd Qu.:50.00 3rd Qu.:36.00
## Max. :44.00 Max. :65.00 Max. :73.00 Max. :58.00
## NA's :14 NA's :16 NA's :19 NA's :14
# Print out a subset:
# the individuals who scored high on both Extroversion and Agreeableness,
# i.e. higher than 40 on each of these two categories.
# which() keeps only the rows where the test is TRUE, so rows with a missing
# score are left out.
traits[which(traits$Extroversion > 40 & traits$Agreeableness > 40), ]
## # A tibble: 8 x 4
## Neurotic Extroversion Agreeableness Conscientiousness
## <dbl> <dbl> <dbl> <dbl>
## 1 38 43 49 29
## 2 20 42 46 31
## 3 18 42 49 31
## 4 42 43 44 29
## 5 30 42 51 24
## 6 18 42 50 25
## 7 27 45 55 23
## 8 18 43 57 34
##############################################
# cat("\014") # clear screen before next section
##############################################
# Import SPSS data from the URL: work
work <- read_sav(file = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/employee.sav")
# Information on employees and their demographic and economic attributes
# QRiE: http://cehd.gmu.edu/book/dimitrov/spss
# Display summary of work$GENDER (still a labelled vector at this point)
summary(work[["GENDER"]])
## Length Class Mode
## 474 labelled character
# Convert work$GENDER to a factor
# (factor = the class R uses to denote categorical variables)
work[["GENDER"]] <- as_factor(work[["GENDER"]])
# Display summary of work$GENDER again; it now reports a count per level
summary(work[["GENDER"]])
## Female Male
## 216 258
##############################################
# cat("\014") # clear screen before next section
##############################################
library(foreign)
# http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/florida.dta
# (Source: Florida Department of State)
# http://results.elections.myflorida.com/
# US presidential elections in the year 2000:
# total number of votes for each of the four candidates and
# total number of votes per election area in the state of Florida
# Import florida.dta and name the resulting data frame florida.
# The URL is stored once, in the variable florida.dta, and handed to
# read.dta().
florida.dta <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/florida.dta"
florida <- read.dta(florida.dta)
# Check tail() of florida
tail(florida)
## gore bush buchanan nader total
## 62 2647 4051 27 59 6784
## 63 1399 2326 26 29 3780
## 64 97063 82214 396 2436 182109
## 65 3835 4511 46 149 8541
## 66 5637 12176 120 265 18198
## 67 2796 4983 88 93 7960
# Internal storage type of the imported object
typeof(florida)
## [1] "list"
##############################################
# cat("\014") # clear screen before next section
##############################################
# Socio-economic measures and access to education for different individuals
# http://datatopics.worldbank.org/Gender/topic/education
# Create the target directory only when it is missing: calling dir.create()
# on an existing directory raised the warning "'worldbank' already exists"
# on every re-run of this script.
if (!dir.exists("worldbank")) {
  dir.create("worldbank")
}
# Specify the file path using file.path(): path
# It is built once and used for both the download and the import.
path <- file.path("worldbank", "edequality.dta")
# Download in binary mode (mode "wb" = binary).
# With the following call (i.e. without mode = "wb")
#
# download.file(url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/edequality.dta",
#               destfile = "worldbank/edequality.dta")
#
# read.dta(path)
# returned the following error:
# Error in read.dta(path) : a binary read error occurred
# probably due to translated newlines
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/edequality.dta",
  destfile = path,
  mode = "wb"
)
# Create and print structure of edu_equal_1
edu_equal_1 <- read.dta(path)
str(edu_equal_1)
## 'data.frame': 12214 obs. of 27 variables:
## $ hhid : num 1 1 1 2 2 3 4 4 5 6 ...
## $ hhweight : num 627 627 627 627 627 ...
## $ location : Factor w/ 2 levels "urban location",..: 1 1 1 1 1 2 2 2 1 1 ...
## $ region : Factor w/ 9 levels "Sofia city","Bourgass",..: 8 8 8 9 9 4 4 4 8 8 ...
## $ ethnicity_head : Factor w/ 4 levels "Bulgaria","Turks",..: 2 2 2 1 1 1 1 1 1 1 ...
## $ age : num 37 11 8 73 70 75 79 80 82 83 ...
## $ gender : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 1 2 2 2 ...
## $ relation : Factor w/ 9 levels "head ",..: 1 3 3 1 2 1 1 2 1 1 ...
## $ literate : Factor w/ 2 levels "no","yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ income_mnt : num 13.3 13.3 13.3 142.5 142.5 ...
## $ income : num 160 160 160 1710 1710 ...
## $ aggregate : num 1042 1042 1042 3271 3271 ...
## $ aggr_ind_annual : num 347 347 347 1635 1635 ...
## $ educ_completed : int 2 4 4 4 3 3 3 3 4 4 ...
## $ grade_complete : num 4 3 0 3 4 4 4 4 5 5 ...
## $ grade_all : num 4 11 8 11 8 8 8 8 13 13 ...
## $ unemployed : int 2 1 1 1 1 1 1 1 1 1 ...
## $ reason_OLF : int NA NA NA 3 3 3 9 9 3 3 ...
## $ sector : int NA NA NA NA NA NA 1 1 NA NA ...
## $ occupation : int NA NA NA NA NA NA 5 5 NA NA ...
## $ earn_mont : num 0 0 0 0 0 0 20 20 0 0 ...
## $ earn_ann : num 0 0 0 0 0 0 240 240 0 0 ...
## $ hours_week : num NA NA NA NA NA NA 30 35 NA NA ...
## $ hours_mnt : num NA NA NA NA NA ...
## $ fulltime : int NA NA NA NA NA NA 1 1 NA NA ...
## $ hhexp : num 100 100 100 343 343 ...
## $ legacy_pension_amt: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr ""
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 100 100 108 108 108 100 108 108 108 100 ...
## - attr(*, "val.labels")= chr "" "" "location" "region" ...
## - attr(*, "var.labels")= chr "hhid" "hhweight" "location" "region" ...
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr "_dta" "_svy_su1" "cluster"
## ..$ : chr "_dta" "_svy_strata1" "strata"
## ..$ : chr "_dta" "_svy_stages" "1"
## ..$ : chr "_dta" "_svy_version" "2"
## ..$ : chr "_dta" "__XijVarLabcons" "(sum) cons"
## ..$ : chr "_dta" "ReS_Xij" "cons"
## ..$ : chr "_dta" "ReS_str" "0"
## ..$ : chr "_dta" "ReS_j" "group"
## ..$ : chr "_dta" "ReS_ver" "v.2"
## ..$ : chr "_dta" "ReS_i" "hhid dur"
## ..$ : chr "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc, gall, health, rent, durables we"| __truncated__
## ..$ : chr "_dta" "note0" "1"
## - attr(*, "version")= int 7
## - attr(*, "label.table")=List of 12
## ..$ location: Named int 1 2
## .. ..- attr(*, "names")= chr "urban location" "rural location"
## ..$ region : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "Sofia city" "Bourgass" "Varna" "Lovetch" ...
## ..$ ethnic : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "Bulgaria" "Turks" "Roma" "Other"
## ..$ s2_q2 : Named int 1 2
## .. ..- attr(*, "names")= chr "male" "female"
## ..$ s2_q3 : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "head " "spouse/partner " "child " "son/daughter-in-law " ...
## ..$ lit : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
## ..$ : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "never attanded" "primary" "secondary" "postsecondary"
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "Not unemployed" "Unemployed"
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "student" "housewife/childcare" "in retirement" "illness, disability" ...
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "agriculture" "mining" "manufacturing" "utilities" ...
## ..$ : Named int 1 2 3 4 5
## .. ..- attr(*, "names")= chr "private company" "public works program" "government,public sector, army" "private individual" ...
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
# Create and print structure of edu_equal_2
# (read with convert.factors = FALSE)
edu_equal_2 <- read.dta(file = path, convert.factors = FALSE)
str(object = edu_equal_2)
## 'data.frame': 12214 obs. of 27 variables:
## $ hhid : num 1 1 1 2 2 3 4 4 5 6 ...
## $ hhweight : num 627 627 627 627 627 ...
## $ location : int 1 1 1 1 1 2 2 2 1 1 ...
## $ region : int 8 8 8 9 9 4 4 4 8 8 ...
## $ ethnicity_head : int 2 2 2 1 1 1 1 1 1 1 ...
## $ age : num 37 11 8 73 70 75 79 80 82 83 ...
## $ gender : int 2 2 1 1 2 1 1 2 2 2 ...
## $ relation : int 1 3 3 1 2 1 1 2 1 1 ...
## $ literate : int 1 2 2 2 2 2 2 2 2 2 ...
## $ income_mnt : num 13.3 13.3 13.3 142.5 142.5 ...
## $ income : num 160 160 160 1710 1710 ...
## $ aggregate : num 1042 1042 1042 3271 3271 ...
## $ aggr_ind_annual : num 347 347 347 1635 1635 ...
## $ educ_completed : int 2 4 4 4 3 3 3 3 4 4 ...
## $ grade_complete : num 4 3 0 3 4 4 4 4 5 5 ...
## $ grade_all : num 4 11 8 11 8 8 8 8 13 13 ...
## $ unemployed : int 2 1 1 1 1 1 1 1 1 1 ...
## $ reason_OLF : int NA NA NA 3 3 3 9 9 3 3 ...
## $ sector : int NA NA NA NA NA NA 1 1 NA NA ...
## $ occupation : int NA NA NA NA NA NA 5 5 NA NA ...
## $ earn_mont : num 0 0 0 0 0 0 20 20 0 0 ...
## $ earn_ann : num 0 0 0 0 0 0 240 240 0 0 ...
## $ hours_week : num NA NA NA NA NA NA 30 35 NA NA ...
## $ hours_mnt : num NA NA NA NA NA ...
## $ fulltime : int NA NA NA NA NA NA 1 1 NA NA ...
## $ hhexp : num 100 100 100 343 343 ...
## $ legacy_pension_amt: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr ""
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 100 100 108 108 108 100 108 108 108 100 ...
## - attr(*, "val.labels")= chr "" "" "location" "region" ...
## - attr(*, "var.labels")= chr "hhid" "hhweight" "location" "region" ...
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr "_dta" "_svy_su1" "cluster"
## ..$ : chr "_dta" "_svy_strata1" "strata"
## ..$ : chr "_dta" "_svy_stages" "1"
## ..$ : chr "_dta" "_svy_version" "2"
## ..$ : chr "_dta" "__XijVarLabcons" "(sum) cons"
## ..$ : chr "_dta" "ReS_Xij" "cons"
## ..$ : chr "_dta" "ReS_str" "0"
## ..$ : chr "_dta" "ReS_j" "group"
## ..$ : chr "_dta" "ReS_ver" "v.2"
## ..$ : chr "_dta" "ReS_i" "hhid dur"
## ..$ : chr "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc, gall, health, rent, durables we"| __truncated__
## ..$ : chr "_dta" "note0" "1"
## - attr(*, "version")= int 7
## - attr(*, "label.table")=List of 12
## ..$ location: Named int 1 2
## .. ..- attr(*, "names")= chr "urban location" "rural location"
## ..$ region : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "Sofia city" "Bourgass" "Varna" "Lovetch" ...
## ..$ ethnic : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "Bulgaria" "Turks" "Roma" "Other"
## ..$ s2_q2 : Named int 1 2
## .. ..- attr(*, "names")= chr "male" "female"
## ..$ s2_q3 : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "head " "spouse/partner " "child " "son/daughter-in-law " ...
## ..$ lit : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
## ..$ : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "never attanded" "primary" "secondary" "postsecondary"
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "Not unemployed" "Unemployed"
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "student" "housewife/childcare" "in retirement" "illness, disability" ...
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "agriculture" "mining" "manufacturing" "utilities" ...
## ..$ : Named int 1 2 3 4 5
## .. ..- attr(*, "names")= chr "private company" "public works program" "government,public sector, army" "private individual" ...
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
# Create and print structure of edu_equal_3
# (read with convert.underscore = TRUE)
edu_equal_3 <- read.dta(file = path, convert.underscore = TRUE)
str(object = edu_equal_3)
## 'data.frame': 12214 obs. of 27 variables:
## $ hhid : num 1 1 1 2 2 3 4 4 5 6 ...
## $ hhweight : num 627 627 627 627 627 ...
## $ location : Factor w/ 2 levels "urban location",..: 1 1 1 1 1 2 2 2 1 1 ...
## $ region : Factor w/ 9 levels "Sofia city","Bourgass",..: 8 8 8 9 9 4 4 4 8 8 ...
## $ ethnicity.head : Factor w/ 4 levels "Bulgaria","Turks",..: 2 2 2 1 1 1 1 1 1 1 ...
## $ age : num 37 11 8 73 70 75 79 80 82 83 ...
## $ gender : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 1 2 2 2 ...
## $ relation : Factor w/ 9 levels "head ",..: 1 3 3 1 2 1 1 2 1 1 ...
## $ literate : Factor w/ 2 levels "no","yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ income.mnt : num 13.3 13.3 13.3 142.5 142.5 ...
## $ income : num 160 160 160 1710 1710 ...
## $ aggregate : num 1042 1042 1042 3271 3271 ...
## $ aggr.ind.annual : num 347 347 347 1635 1635 ...
## $ educ.completed : int 2 4 4 4 3 3 3 3 4 4 ...
## $ grade.complete : num 4 3 0 3 4 4 4 4 5 5 ...
## $ grade.all : num 4 11 8 11 8 8 8 8 13 13 ...
## $ unemployed : int 2 1 1 1 1 1 1 1 1 1 ...
## $ reason.OLF : int NA NA NA 3 3 3 9 9 3 3 ...
## $ sector : int NA NA NA NA NA NA 1 1 NA NA ...
## $ occupation : int NA NA NA NA NA NA 5 5 NA NA ...
## $ earn.mont : num 0 0 0 0 0 0 20 20 0 0 ...
## $ earn.ann : num 0 0 0 0 0 0 240 240 0 0 ...
## $ hours.week : num NA NA NA NA NA NA 30 35 NA NA ...
## $ hours.mnt : num NA NA NA NA NA ...
## $ fulltime : int NA NA NA NA NA NA 1 1 NA NA ...
## $ hhexp : num 100 100 100 343 343 ...
## $ legacy.pension.amt: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr ""
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 100 100 108 108 108 100 108 108 108 100 ...
## - attr(*, "val.labels")= chr "" "" "location" "region" ...
## - attr(*, "var.labels")= chr "hhid" "hhweight" "location" "region" ...
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr "_dta" "_svy_su1" "cluster"
## ..$ : chr "_dta" "_svy_strata1" "strata"
## ..$ : chr "_dta" "_svy_stages" "1"
## ..$ : chr "_dta" "_svy_version" "2"
## ..$ : chr "_dta" "__XijVarLabcons" "(sum) cons"
## ..$ : chr "_dta" "ReS_Xij" "cons"
## ..$ : chr "_dta" "ReS_str" "0"
## ..$ : chr "_dta" "ReS_j" "group"
## ..$ : chr "_dta" "ReS_ver" "v.2"
## ..$ : chr "_dta" "ReS_i" "hhid dur"
## ..$ : chr "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc, gall, health, rent, durables we"| __truncated__
## ..$ : chr "_dta" "note0" "1"
## - attr(*, "version")= int 7
## - attr(*, "label.table")=List of 12
## ..$ location: Named int 1 2
## .. ..- attr(*, "names")= chr "urban location" "rural location"
## ..$ region : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "Sofia city" "Bourgass" "Varna" "Lovetch" ...
## ..$ ethnic : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "Bulgaria" "Turks" "Roma" "Other"
## ..$ s2_q2 : Named int 1 2
## .. ..- attr(*, "names")= chr "male" "female"
## ..$ s2_q3 : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "head " "spouse/partner " "child " "son/daughter-in-law " ...
## ..$ lit : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
## ..$ : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "never attanded" "primary" "secondary" "postsecondary"
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "Not unemployed" "Unemployed"
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "student" "housewife/childcare" "in retirement" "illness, disability" ...
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "agriculture" "mining" "manufacturing" "utilities" ...
## ..$ : Named int 1 2 3 4 5
## .. ..- attr(*, "names")= chr "private company" "public works program" "government,public sector, army" "private individual" ...
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
# The structure shows, for example, that
# age is numeric (num) and
# literate is a factor with the levels "no" and "yes".
str(object = edu_equal_1)
## 'data.frame': 12214 obs. of 27 variables:
## $ hhid : num 1 1 1 2 2 3 4 4 5 6 ...
## $ hhweight : num 627 627 627 627 627 ...
## $ location : Factor w/ 2 levels "urban location",..: 1 1 1 1 1 2 2 2 1 1 ...
## $ region : Factor w/ 9 levels "Sofia city","Bourgass",..: 8 8 8 9 9 4 4 4 8 8 ...
## $ ethnicity_head : Factor w/ 4 levels "Bulgaria","Turks",..: 2 2 2 1 1 1 1 1 1 1 ...
## $ age : num 37 11 8 73 70 75 79 80 82 83 ...
## $ gender : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 1 2 2 2 ...
## $ relation : Factor w/ 9 levels "head ",..: 1 3 3 1 2 1 1 2 1 1 ...
## $ literate : Factor w/ 2 levels "no","yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ income_mnt : num 13.3 13.3 13.3 142.5 142.5 ...
## $ income : num 160 160 160 1710 1710 ...
## $ aggregate : num 1042 1042 1042 3271 3271 ...
## $ aggr_ind_annual : num 347 347 347 1635 1635 ...
## $ educ_completed : int 2 4 4 4 3 3 3 3 4 4 ...
## $ grade_complete : num 4 3 0 3 4 4 4 4 5 5 ...
## $ grade_all : num 4 11 8 11 8 8 8 8 13 13 ...
## $ unemployed : int 2 1 1 1 1 1 1 1 1 1 ...
## $ reason_OLF : int NA NA NA 3 3 3 9 9 3 3 ...
## $ sector : int NA NA NA NA NA NA 1 1 NA NA ...
## $ occupation : int NA NA NA NA NA NA 5 5 NA NA ...
## $ earn_mont : num 0 0 0 0 0 0 20 20 0 0 ...
## $ earn_ann : num 0 0 0 0 0 0 240 240 0 0 ...
## $ hours_week : num NA NA NA NA NA NA 30 35 NA NA ...
## $ hours_mnt : num NA NA NA NA NA ...
## $ fulltime : int NA NA NA NA NA NA 1 1 NA NA ...
## $ hhexp : num 100 100 100 343 343 ...
## $ legacy_pension_amt: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr ""
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 100 100 108 108 108 100 108 108 108 100 ...
## - attr(*, "val.labels")= chr "" "" "location" "region" ...
## - attr(*, "var.labels")= chr "hhid" "hhweight" "location" "region" ...
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr "_dta" "_svy_su1" "cluster"
## ..$ : chr "_dta" "_svy_strata1" "strata"
## ..$ : chr "_dta" "_svy_stages" "1"
## ..$ : chr "_dta" "_svy_version" "2"
## ..$ : chr "_dta" "__XijVarLabcons" "(sum) cons"
## ..$ : chr "_dta" "ReS_Xij" "cons"
## ..$ : chr "_dta" "ReS_str" "0"
## ..$ : chr "_dta" "ReS_j" "group"
## ..$ : chr "_dta" "ReS_ver" "v.2"
## ..$ : chr "_dta" "ReS_i" "hhid dur"
## ..$ : chr "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc, gall, health, rent, durables we"| __truncated__
## ..$ : chr "_dta" "note0" "1"
## - attr(*, "version")= int 7
## - attr(*, "label.table")=List of 12
## ..$ location: Named int 1 2
## .. ..- attr(*, "names")= chr "urban location" "rural location"
## ..$ region : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "Sofia city" "Bourgass" "Varna" "Lovetch" ...
## ..$ ethnic : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "Bulgaria" "Turks" "Roma" "Other"
## ..$ s2_q2 : Named int 1 2
## .. ..- attr(*, "names")= chr "male" "female"
## ..$ s2_q3 : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "head " "spouse/partner " "child " "son/daughter-in-law " ...
## ..$ lit : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
## ..$ : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "never attanded" "primary" "secondary" "postsecondary"
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "Not unemployed" "Unemployed"
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "student" "housewife/childcare" "in retirement" "illness, disability" ...
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "agriculture" "mining" "manufacturing" "utilities" ...
## ..$ : Named int 1 2 3 4 5
## .. ..- attr(*, "names")= chr "private company" "public works program" "government,public sector, army" "private individual" ...
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
# How many observations (i.e. how many people) are older than 40
# and literate?
# Summing the logical test counts the matching rows; na.rm = TRUE leaves out
# rows for which the test is NA.
with(edu_equal_1, sum(age > 40 & literate == "yes", na.rm = TRUE))
## [1] 6506
# How many observations/individuals
# from Bulgaria
# have an income above 1000?
with(
  edu_equal_1,
  sum(income > 1000 & ethnicity_head == "Bulgaria", na.rm = TRUE)
)
## [1] 8997
##############################################
# cat("\014") # clear screen before next section
##############################################
# Socio-economic variables from different countries
# Source: Quantitative Data Analysis in Education
# http://cw.routledge.com/textbooks/9780415372985/resources/datasets.asp
# http://cw.routledge.com/textbooks/9780415372985/sav/international.sav
# List the contents of the working directory before downloading
list.files(path = ".")
## [1] "1 - importing-data-from-databases.R"
## [2] "1 - importing-data-from-databases.Rmd"
## [3] "1_-_importing-data-from-databases.html"
## [4] "2 - importing-data-from-statistical-software-packages.R"
## [5] "2 - importing-data-from-statistical-software-packages.Rmd"
## [6] "2_-_importing-data-from-statistical-software-packages.Rmd"
## [7] "3 - importing-data-from-web.R"
## [8] "importing_data_in_r_2_ch1.pdf"
## [9] "importing_data_in_r_2_ch2.pdf"
## [10] "importing_data_in_r_2_ch3.pdf"
## [11] "importing_data_in_r_2_ch4.pdf"
## [12] "importing_data_in_r_2_ch5.pdf"
## [13] "rsconnect"
## [14] "worldbank"
# Fetch the SPSS file from the publisher's site.
# mode = "wb" requests a binary transfer: .sav is a binary format, and a
# text-mode download can corrupt it through translated newlines (the same
# problem this script describes for the .dta download).
download.file(
  url = "http://cw.routledge.com/textbooks/9780415372985/sav/international.sav",
  destfile = "international.sav",
  mode = "wb"
)
# List the working directory again; international.sav should now appear
list.files()
## [1] "1 - importing-data-from-databases.R"
## [2] "1 - importing-data-from-databases.Rmd"
## [3] "1_-_importing-data-from-databases.html"
## [4] "2 - importing-data-from-statistical-software-packages.R"
## [5] "2 - importing-data-from-statistical-software-packages.Rmd"
## [6] "2_-_importing-data-from-statistical-software-packages.Rmd"
## [7] "3 - importing-data-from-web.R"
## [8] "importing_data_in_r_2_ch1.pdf"
## [9] "importing_data_in_r_2_ch2.pdf"
## [10] "importing_data_in_r_2_ch3.pdf"
## [11] "importing_data_in_r_2_ch4.pdf"
## [12] "importing_data_in_r_2_ch5.pdf"
## [13] "international.sav"
## [14] "rsconnect"
## [15] "worldbank"
# Download international.sav from the DataCamp mirror; any existing
# international.sav in the working directory is overwritten.
# mode = "wb" requests a binary transfer: .sav is a binary format, and a
# text-mode download can corrupt it through translated newlines.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/international.sav",
  destfile = "international.sav",
  mode = "wb"
)
# List the working directory once more
list.files()
## [1] "1 - importing-data-from-databases.R"
## [2] "1 - importing-data-from-databases.Rmd"
## [3] "1_-_importing-data-from-databases.html"
## [4] "2 - importing-data-from-statistical-software-packages.R"
## [5] "2 - importing-data-from-statistical-software-packages.Rmd"
## [6] "2_-_importing-data-from-statistical-software-packages.Rmd"
## [7] "3 - importing-data-from-web.R"
## [8] "importing_data_in_r_2_ch1.pdf"
## [9] "importing_data_in_r_2_ch2.pdf"
## [10] "importing_data_in_r_2_ch3.pdf"
## [11] "importing_data_in_r_2_ch4.pdf"
## [12] "importing_data_in_r_2_ch5.pdf"
## [13] "international.sav"
## [14] "rsconnect"
## [15] "worldbank"
# Import international.sav as a data frame: demo
demo <- read.spss(file = "international.sav", to.data.frame = TRUE)
## re-encoding from CP1252
# Create boxplot of gdp variable of demo
boxplot(demo[["gdp"]])

##############################################
# cat("\014") # clear screen before next section
##############################################
# Pearson's Correlation
#
# A measure of the linear dependency between two variables, say X and Y.
# It can range from -1 to 1:
# - close to 1: there is a strong positive association between the variables;
#   if X is high, Y also tends to be high.
# - close to -1: there is a strong negative association;
#   if X is high, Y tends to be low.
# - 0: there is no linear association between X and Y
#   (the variables are possibly independent).
#
# What is the correlation coefficient
# for the two numerical variables
# gdp and f_illit (female illiteracy rate)?
with(demo, cor(gdp, f_illit))
## [1] -0.4476856
# That indicates a negative association between GDP and female illiteracy
##############################################
# cat("\014") # clear screen before next section
##############################################
# Import international.sav as demo_1
demo_1 <- read.spss(file = "international.sav", to.data.frame = TRUE)
## re-encoding from CP1252
# Print out the first rows of demo_1
head(x = demo_1)
## id country contint m_illit f_illit lifeexpt gdp
## 1 1 Argentina Americas 3.0 3.0 16 3375
## 2 2 Benin Africa 45.2 74.5 7 521
## 3 3 Burundi Africa 33.2 48.1 5 86
## 4 4 Chile Americas 4.2 4.4 14 4523
## 5 5 Dominican Republic Americas 12.0 12.7 12 2408
## 6 6 El Salvador Americas 17.6 22.9 11 2302
# Import international.sav as demo_2
# use.value.labels = FALSE: variables with value labels are NOT converted
# to R factors
demo_2 <- read.spss(
  file = "international.sav",
  to.data.frame = TRUE,
  use.value.labels = FALSE
)
## re-encoding from CP1252
# Print out the first rows of demo_2
head(x = demo_2)
## id country contint m_illit f_illit lifeexpt gdp
## 1 1 Argentina 2 3.0 3.0 16 3375
## 2 2 Benin 1 45.2 74.5 7 521
## 3 3 Burundi 1 33.2 48.1 5 86
## 4 4 Chile 2 4.2 4.4 14 4523
## 5 5 Dominican Republic 2 12.0 12.7 12 2408
## 6 6 El Salvador 2 17.6 22.9 11 2302