# 2 - importing-data-from-statistical-software-packages

# import data from three software packages: SAS, STATA and SPSS.
#
# SAS: read_sas()
# STATA: read_dta() (or read_stata(), which are identical)
# SPSS: read_sav() or read_por(), depending on the file type.
# Location of the SAS data set that is read into `sales` below
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/sales.sas7bdat"

##############################################
# cat("\014") # clear screen before next section
##############################################

# Import sales.sas7bdat: sales
# haven has to be installed once beforehand: install.packages("haven")
library(haven)
sales <- read_sas(url)

# Display the structure of sales
str(sales)
## Classes 'tbl_df', 'tbl' and 'data.frame':    431 obs. of  4 variables:
##  $ purchase: num  0 0 1 1 0 0 0 0 0 0 ...
##  $ age     : num  41 47 41 39 32 32 33 45 43 40 ...
##  $ gender  : chr  "Female" "Female" "Female" "Female" ...
##  $ income  : chr  "Low" "Low" "Low" "Low" ...
##  - attr(*, "label")= chr "SALES"

##############################################
# cat("\014") # clear screen before next section
##############################################

# Import the data from the URL: sugar
# Yearly import and export figures for sugar, both in USD and in weight.
url <- "http://assets.datacamp.com/production/course_1478/datasets/trade.dta"
sugar <- read_dta(url)

# Structure of sugar
# The Date column is imported with class 'labelled' (see the str() output).
str(sugar)
## Classes 'tbl_df', 'tbl' and 'data.frame':    10 obs. of  5 variables:
##  $ Date    : 'labelled' num  10 9 8 7 6 5 4 3 2 1
##   ..- attr(*, "label")= chr "Date"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##   ..- attr(*, "labels")= Named num  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "2004-12-31" "2005-12-31" "2006-12-31" "2007-12-31" ...
##  $ Import  : num  37664782 16316512 11082246 35677943 9879878 ...
##   ..- attr(*, "label")= chr "Import"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ Weight_I: num  54029106 21584365 14526089 55034932 14806865 ...
##   ..- attr(*, "label")= chr "Weight_I"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ Export  : num  5.45e+07 1.03e+08 3.79e+07 4.85e+07 7.15e+07 ...
##   ..- attr(*, "label")= chr "Export"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ Weight_E: num  9.34e+07 1.58e+08 8.80e+07 1.12e+08 1.32e+08 ...
##   ..- attr(*, "label")= chr "Weight_E"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  - attr(*, "label")= chr "Written by R."

# Replace the labelled Date codes by their value labels (via as_factor())
# and parse those labels as Date values
sugar[["Date"]] <- as.Date(as_factor(sugar[["Date"]]))

# Inspect the structure once more; Date is now of class Date
str(object = sugar)
## Classes 'tbl_df', 'tbl' and 'data.frame':    10 obs. of  5 variables:
##  $ Date    : Date, format: "2013-12-31" "2012-12-31" ...
##  $ Import  : num  37664782 16316512 11082246 35677943 9879878 ...
##   ..- attr(*, "label")= chr "Import"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ Weight_I: num  54029106 21584365 14526089 55034932 14806865 ...
##   ..- attr(*, "label")= chr "Weight_I"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ Export  : num  5.45e+07 1.03e+08 3.79e+07 4.85e+07 7.15e+07 ...
##   ..- attr(*, "label")= chr "Export"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ Weight_E: num  9.34e+07 1.58e+08 8.80e+07 1.12e+08 1.32e+08 ...
##   ..- attr(*, "label")= chr "Weight_E"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  - attr(*, "label")= chr "Written by R."

# Scatter plot of import value against imported weight;
# the two are positively correlated:
plot(x = sugar$Import, y = sugar$Weight_I)

# the more sugar is traded, the higher the weight that is traded

##############################################
# cat("\014") # clear screen before next section
##############################################

# Import SPSS data with haven
library(haven)

# read_sav() - for .sav files 
# read_por() - for .por files

# data on four of the Big Five personality traits for 434 persons 
# (Source: University of Bath). 
# http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html
# http://staff.bath.ac.uk/pssiw/stats2/personality.sav
# Big Five is a psychological concept including, originally, 
# five dimensions of personality to classify human personality. 
# SPSS dataset is called person.sav
# http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/person.sav

# Read the SPSS file person.sav straight from its URL into `traits`
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/person.sav"
traits <- read_sav(file = url)

# Per-column summary; its NA's row reports the missing values per variable
summary(object = traits)
##     Neurotic      Extroversion   Agreeableness   Conscientiousness
##  Min.   : 0.00   Min.   : 5.00   Min.   :15.00   Min.   : 7.00    
##  1st Qu.:18.00   1st Qu.:26.00   1st Qu.:39.00   1st Qu.:25.00    
##  Median :24.00   Median :31.00   Median :45.00   Median :30.00    
##  Mean   :23.63   Mean   :30.23   Mean   :44.55   Mean   :30.85    
##  3rd Qu.:29.00   3rd Qu.:34.00   3rd Qu.:50.00   3rd Qu.:36.00    
##  Max.   :44.00   Max.   :65.00   Max.   :73.00   Max.   :58.00    
##  NA's   :14      NA's   :16      NA's   :19      NA's   :14

# Show only the people with high scores on both Extroversion and
# Agreeableness, i.e. more than 40 on each of the two traits
# (which() drops rows where the comparison is NA, as subset() does)
traits[which(traits$Extroversion > 40 & traits$Agreeableness > 40), ]
## # A tibble: 8 x 4
##   Neurotic Extroversion Agreeableness Conscientiousness
##      <dbl>        <dbl>         <dbl>             <dbl>
## 1       38           43            49                29
## 2       20           42            46                31
## 3       18           42            49                31
## 4       42           43            44                29
## 5       30           42            51                24
## 6       18           42            50                25
## 7       27           45            55                23
## 8       18           43            57                34

##############################################
# cat("\014") # clear screen before next section
##############################################

# Employee data (demographic and economic attributes), read directly from
# the URL of the SPSS file into `work`
# QRiE: http://cehd.gmu.edu/book/dimitrov/spss
work <- read_sav(
  file = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/employee.sav"
)

# Summary of the GENDER column as imported (a labelled character vector)
summary(work[["GENDER"]])
##    Length     Class      Mode 
##       474  labelled character


# Turn GENDER into a factor, R's class for categorical variables
work[["GENDER"]] <- as_factor(work[["GENDER"]])

# Summary of GENDER after the conversion: counts per level
summary(work[["GENDER"]])
## Female   Male 
##    216    258

##############################################
# cat("\014") # clear screen before next section
##############################################

library(foreign)

# http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/florida.dta
# (Source: Florida Department of State)
# http://results.elections.myflorida.com/
# US presidential elections in the year 2000
# total numbers of votes for each of the four candidates
# total number of votes per election area in the state of Florida

# Import florida.dta and name the resulting data frame florida.
# `florida.dta` is a variable holding the URL of the Stata file; it is
# assigned once, with a plain (unquoted) name. Note that
# read.dta("florida.dta"), with quotes, would pass the literal string
# "florida.dta" as the file name instead of this URL.
florida.dta <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/florida.dta"
florida <- read.dta(florida.dta)

# Check tail() of florida
tail(florida)
##     gore  bush buchanan nader  total
## 62  2647  4051       27    59   6784
## 63  1399  2326       26    29   3780
## 64 97063 82214      396  2436 182109
## 65  3835  4511       46   149   8541
## 66  5637 12176      120   265  18198
## 67  2796  4983       88    93   7960

# A data frame is stored as a list (of columns), hence "list"
typeof(florida)
## [1] "list"

##############################################
# cat("\014") # clear screen before next section
##############################################

# socio-economic measures and access to education for different individuals
# http://datatopics.worldbank.org/Gender/topic/education

# Create the target directory; showWarnings = FALSE keeps re-runs quiet
# when 'worldbank' already exists.
dir.create("worldbank", showWarnings = FALSE)

# Specify the file path using file.path(): path
# (used both as download destination and as input for read.dta())
path <- file.path("worldbank", "edequality.dta")

# The file has to be downloaded in binary mode (mode = "wb").
# With the following call (i.e. without mode = "wb")
#
# download.file(url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/edequality.dta",
#               destfile = "worldbank/edequality.dta")
#
# read.dta(path)
# returned the following error:
# Error in read.dta(path) : a binary read error occurred
# probably due to translated newlines
download.file(url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/edequality.dta",
              destfile = path,
              mode = "wb")

# Create and print structure of edu_equal_1
edu_equal_1 <- read.dta(path)
str(edu_equal_1)
## 'data.frame':    12214 obs. of  27 variables:
##  $ hhid              : num  1 1 1 2 2 3 4 4 5 6 ...
##  $ hhweight          : num  627 627 627 627 627 ...
##  $ location          : Factor w/ 2 levels "urban location",..: 1 1 1 1 1 2 2 2 1 1 ...
##  $ region            : Factor w/ 9 levels "Sofia city","Bourgass",..: 8 8 8 9 9 4 4 4 8 8 ...
##  $ ethnicity_head    : Factor w/ 4 levels "Bulgaria","Turks",..: 2 2 2 1 1 1 1 1 1 1 ...
##  $ age               : num  37 11 8 73 70 75 79 80 82 83 ...
##  $ gender            : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 1 2 2 2 ...
##  $ relation          : Factor w/ 9 levels "head                      ",..: 1 3 3 1 2 1 1 2 1 1 ...
##  $ literate          : Factor w/ 2 levels "no","yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ income_mnt        : num  13.3 13.3 13.3 142.5 142.5 ...
##  $ income            : num  160 160 160 1710 1710 ...
##  $ aggregate         : num  1042 1042 1042 3271 3271 ...
##  $ aggr_ind_annual   : num  347 347 347 1635 1635 ...
##  $ educ_completed    : int  2 4 4 4 3 3 3 3 4 4 ...
##  $ grade_complete    : num  4 3 0 3 4 4 4 4 5 5 ...
##  $ grade_all         : num  4 11 8 11 8 8 8 8 13 13 ...
##  $ unemployed        : int  2 1 1 1 1 1 1 1 1 1 ...
##  $ reason_OLF        : int  NA NA NA 3 3 3 9 9 3 3 ...
##  $ sector            : int  NA NA NA NA NA NA 1 1 NA NA ...
##  $ occupation        : int  NA NA NA NA NA NA 5 5 NA NA ...
##  $ earn_mont         : num  0 0 0 0 0 0 20 20 0 0 ...
##  $ earn_ann          : num  0 0 0 0 0 0 240 240 0 0 ...
##  $ hours_week        : num  NA NA NA NA NA NA 30 35 NA NA ...
##  $ hours_mnt         : num  NA NA NA NA NA ...
##  $ fulltime          : int  NA NA NA NA NA NA 1 1 NA NA ...
##  $ hhexp             : num  100 100 100 343 343 ...
##  $ legacy_pension_amt: num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr ""
##  - attr(*, "formats")= chr  "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
##  - attr(*, "types")= int  100 100 108 108 108 100 108 108 108 100 ...
##  - attr(*, "val.labels")= chr  "" "" "location" "region" ...
##  - attr(*, "var.labels")= chr  "hhid" "hhweight" "location" "region" ...
##  - attr(*, "expansion.fields")=List of 12
##   ..$ : chr  "_dta" "_svy_su1" "cluster"
##   ..$ : chr  "_dta" "_svy_strata1" "strata"
##   ..$ : chr  "_dta" "_svy_stages" "1"
##   ..$ : chr  "_dta" "_svy_version" "2"
##   ..$ : chr  "_dta" "__XijVarLabcons" "(sum) cons"
##   ..$ : chr  "_dta" "ReS_Xij" "cons"
##   ..$ : chr  "_dta" "ReS_str" "0"
##   ..$ : chr  "_dta" "ReS_j" "group"
##   ..$ : chr  "_dta" "ReS_ver" "v.2"
##   ..$ : chr  "_dta" "ReS_i" "hhid dur"
##   ..$ : chr  "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc,  gall, health, rent, durables we"| __truncated__
##   ..$ : chr  "_dta" "note0" "1"
##  - attr(*, "version")= int 7
##  - attr(*, "label.table")=List of 12
##   ..$ location: Named int  1 2
##   .. ..- attr(*, "names")= chr  "urban location" "rural location"
##   ..$ region  : Named int  1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr  "Sofia city" "Bourgass" "Varna" "Lovetch" ...
##   ..$ ethnic  : Named int  1 2 3 4
##   .. ..- attr(*, "names")= chr  "Bulgaria" "Turks" "Roma" "Other"
##   ..$ s2_q2   : Named int  1 2
##   .. ..- attr(*, "names")= chr  "male" "female"
##   ..$ s2_q3   : Named int  1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr  "head                      " "spouse/partner            " "child                     " "son/daughter-in-law       " ...
##   ..$ lit     : Named int  1 2
##   .. ..- attr(*, "names")= chr  "no" "yes"
##   ..$         : Named int  1 2 3 4
##   .. ..- attr(*, "names")= chr  "never attanded" "primary" "secondary" "postsecondary"
##   ..$         : Named int  1 2
##   .. ..- attr(*, "names")= chr  "Not unemployed" "Unemployed"
##   ..$         : Named int  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "student" "housewife/childcare" "in retirement" "illness, disability" ...
##   ..$         : Named int  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "agriculture" "mining" "manufacturing" "utilities" ...
##   ..$         : Named int  1 2 3 4 5
##   .. ..- attr(*, "names")= chr  "private company" "public works program" "government,public sector, army" "private individual" ...
##   ..$         : Named int  1 2
##   .. ..- attr(*, "names")= chr  "no" "yes"

# edu_equal_2: same file, but value-labelled variables are kept as their
# underlying codes instead of being converted to factors
edu_equal_2 <- read.dta(file = path, convert.factors = FALSE)
str(object = edu_equal_2)
## 'data.frame':    12214 obs. of  27 variables:
##  $ hhid              : num  1 1 1 2 2 3 4 4 5 6 ...
##  $ hhweight          : num  627 627 627 627 627 ...
##  $ location          : int  1 1 1 1 1 2 2 2 1 1 ...
##  $ region            : int  8 8 8 9 9 4 4 4 8 8 ...
##  $ ethnicity_head    : int  2 2 2 1 1 1 1 1 1 1 ...
##  $ age               : num  37 11 8 73 70 75 79 80 82 83 ...
##  $ gender            : int  2 2 1 1 2 1 1 2 2 2 ...
##  $ relation          : int  1 3 3 1 2 1 1 2 1 1 ...
##  $ literate          : int  1 2 2 2 2 2 2 2 2 2 ...
##  $ income_mnt        : num  13.3 13.3 13.3 142.5 142.5 ...
##  $ income            : num  160 160 160 1710 1710 ...
##  $ aggregate         : num  1042 1042 1042 3271 3271 ...
##  $ aggr_ind_annual   : num  347 347 347 1635 1635 ...
##  $ educ_completed    : int  2 4 4 4 3 3 3 3 4 4 ...
##  $ grade_complete    : num  4 3 0 3 4 4 4 4 5 5 ...
##  $ grade_all         : num  4 11 8 11 8 8 8 8 13 13 ...
##  $ unemployed        : int  2 1 1 1 1 1 1 1 1 1 ...
##  $ reason_OLF        : int  NA NA NA 3 3 3 9 9 3 3 ...
##  $ sector            : int  NA NA NA NA NA NA 1 1 NA NA ...
##  $ occupation        : int  NA NA NA NA NA NA 5 5 NA NA ...
##  $ earn_mont         : num  0 0 0 0 0 0 20 20 0 0 ...
##  $ earn_ann          : num  0 0 0 0 0 0 240 240 0 0 ...
##  $ hours_week        : num  NA NA NA NA NA NA 30 35 NA NA ...
##  $ hours_mnt         : num  NA NA NA NA NA ...
##  $ fulltime          : int  NA NA NA NA NA NA 1 1 NA NA ...
##  $ hhexp             : num  100 100 100 343 343 ...
##  $ legacy_pension_amt: num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr ""
##  - attr(*, "formats")= chr  "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
##  - attr(*, "types")= int  100 100 108 108 108 100 108 108 108 100 ...
##  - attr(*, "val.labels")= chr  "" "" "location" "region" ...
##  - attr(*, "var.labels")= chr  "hhid" "hhweight" "location" "region" ...
##  - attr(*, "expansion.fields")=List of 12
##   ..$ : chr  "_dta" "_svy_su1" "cluster"
##   ..$ : chr  "_dta" "_svy_strata1" "strata"
##   ..$ : chr  "_dta" "_svy_stages" "1"
##   ..$ : chr  "_dta" "_svy_version" "2"
##   ..$ : chr  "_dta" "__XijVarLabcons" "(sum) cons"
##   ..$ : chr  "_dta" "ReS_Xij" "cons"
##   ..$ : chr  "_dta" "ReS_str" "0"
##   ..$ : chr  "_dta" "ReS_j" "group"
##   ..$ : chr  "_dta" "ReS_ver" "v.2"
##   ..$ : chr  "_dta" "ReS_i" "hhid dur"
##   ..$ : chr  "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc,  gall, health, rent, durables we"| __truncated__
##   ..$ : chr  "_dta" "note0" "1"
##  - attr(*, "version")= int 7
##  - attr(*, "label.table")=List of 12
##   ..$ location: Named int  1 2
##   .. ..- attr(*, "names")= chr  "urban location" "rural location"
##   ..$ region  : Named int  1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr  "Sofia city" "Bourgass" "Varna" "Lovetch" ...
##   ..$ ethnic  : Named int  1 2 3 4
##   .. ..- attr(*, "names")= chr  "Bulgaria" "Turks" "Roma" "Other"
##   ..$ s2_q2   : Named int  1 2
##   .. ..- attr(*, "names")= chr  "male" "female"
##   ..$ s2_q3   : Named int  1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr  "head                      " "spouse/partner            " "child                     " "son/daughter-in-law       " ...
##   ..$ lit     : Named int  1 2
##   .. ..- attr(*, "names")= chr  "no" "yes"
##   ..$         : Named int  1 2 3 4
##   .. ..- attr(*, "names")= chr  "never attanded" "primary" "secondary" "postsecondary"
##   ..$         : Named int  1 2
##   .. ..- attr(*, "names")= chr  "Not unemployed" "Unemployed"
##   ..$         : Named int  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "student" "housewife/childcare" "in retirement" "illness, disability" ...
##   ..$         : Named int  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "agriculture" "mining" "manufacturing" "utilities" ...
##   ..$         : Named int  1 2 3 4 5
##   .. ..- attr(*, "names")= chr  "private company" "public works program" "government,public sector, army" "private individual" ...
##   ..$         : Named int  1 2
##   .. ..- attr(*, "names")= chr  "no" "yes"

# edu_equal_3: same file, with underscores in variable names replaced by dots
edu_equal_3 <- read.dta(file = path, convert.underscore = TRUE)
str(object = edu_equal_3)
## 'data.frame':    12214 obs. of  27 variables:
##  $ hhid              : num  1 1 1 2 2 3 4 4 5 6 ...
##  $ hhweight          : num  627 627 627 627 627 ...
##  $ location          : Factor w/ 2 levels "urban location",..: 1 1 1 1 1 2 2 2 1 1 ...
##  $ region            : Factor w/ 9 levels "Sofia city","Bourgass",..: 8 8 8 9 9 4 4 4 8 8 ...
##  $ ethnicity.head    : Factor w/ 4 levels "Bulgaria","Turks",..: 2 2 2 1 1 1 1 1 1 1 ...
##  $ age               : num  37 11 8 73 70 75 79 80 82 83 ...
##  $ gender            : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 1 2 2 2 ...
##  $ relation          : Factor w/ 9 levels "head                      ",..: 1 3 3 1 2 1 1 2 1 1 ...
##  $ literate          : Factor w/ 2 levels "no","yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ income.mnt        : num  13.3 13.3 13.3 142.5 142.5 ...
##  $ income            : num  160 160 160 1710 1710 ...
##  $ aggregate         : num  1042 1042 1042 3271 3271 ...
##  $ aggr.ind.annual   : num  347 347 347 1635 1635 ...
##  $ educ.completed    : int  2 4 4 4 3 3 3 3 4 4 ...
##  $ grade.complete    : num  4 3 0 3 4 4 4 4 5 5 ...
##  $ grade.all         : num  4 11 8 11 8 8 8 8 13 13 ...
##  $ unemployed        : int  2 1 1 1 1 1 1 1 1 1 ...
##  $ reason.OLF        : int  NA NA NA 3 3 3 9 9 3 3 ...
##  $ sector            : int  NA NA NA NA NA NA 1 1 NA NA ...
##  $ occupation        : int  NA NA NA NA NA NA 5 5 NA NA ...
##  $ earn.mont         : num  0 0 0 0 0 0 20 20 0 0 ...
##  $ earn.ann          : num  0 0 0 0 0 0 240 240 0 0 ...
##  $ hours.week        : num  NA NA NA NA NA NA 30 35 NA NA ...
##  $ hours.mnt         : num  NA NA NA NA NA ...
##  $ fulltime          : int  NA NA NA NA NA NA 1 1 NA NA ...
##  $ hhexp             : num  100 100 100 343 343 ...
##  $ legacy.pension.amt: num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr ""
##  - attr(*, "formats")= chr  "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
##  - attr(*, "types")= int  100 100 108 108 108 100 108 108 108 100 ...
##  - attr(*, "val.labels")= chr  "" "" "location" "region" ...
##  - attr(*, "var.labels")= chr  "hhid" "hhweight" "location" "region" ...
##  - attr(*, "expansion.fields")=List of 12
##   ..$ : chr  "_dta" "_svy_su1" "cluster"
##   ..$ : chr  "_dta" "_svy_strata1" "strata"
##   ..$ : chr  "_dta" "_svy_stages" "1"
##   ..$ : chr  "_dta" "_svy_version" "2"
##   ..$ : chr  "_dta" "__XijVarLabcons" "(sum) cons"
##   ..$ : chr  "_dta" "ReS_Xij" "cons"
##   ..$ : chr  "_dta" "ReS_str" "0"
##   ..$ : chr  "_dta" "ReS_j" "group"
##   ..$ : chr  "_dta" "ReS_ver" "v.2"
##   ..$ : chr  "_dta" "ReS_i" "hhid dur"
##   ..$ : chr  "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc,  gall, health, rent, durables we"| __truncated__
##   ..$ : chr  "_dta" "note0" "1"
##  - attr(*, "version")= int 7
##  - attr(*, "label.table")=List of 12
##   ..$ location: Named int  1 2
##   .. ..- attr(*, "names")= chr  "urban location" "rural location"
##   ..$ region  : Named int  1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr  "Sofia city" "Bourgass" "Varna" "Lovetch" ...
##   ..$ ethnic  : Named int  1 2 3 4
##   .. ..- attr(*, "names")= chr  "Bulgaria" "Turks" "Roma" "Other"
##   ..$ s2_q2   : Named int  1 2
##   .. ..- attr(*, "names")= chr  "male" "female"
##   ..$ s2_q3   : Named int  1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr  "head                      " "spouse/partner            " "child                     " "son/daughter-in-law       " ...
##   ..$ lit     : Named int  1 2
##   .. ..- attr(*, "names")= chr  "no" "yes"
##   ..$         : Named int  1 2 3 4
##   .. ..- attr(*, "names")= chr  "never attanded" "primary" "secondary" "postsecondary"
##   ..$         : Named int  1 2
##   .. ..- attr(*, "names")= chr  "Not unemployed" "Unemployed"
##   ..$         : Named int  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "student" "housewife/childcare" "in retirement" "illness, disability" ...
##   ..$         : Named int  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "agriculture" "mining" "manufacturing" "utilities" ...
##   ..$         : Named int  1 2 3 4 5
##   .. ..- attr(*, "names")= chr  "private company" "public works program" "government,public sector, army" "private individual" ...
##   ..$         : Named int  1 2
##   .. ..- attr(*, "names")= chr  "no" "yes"


# From the structure of edu_equal_1 we can see that
# age is numeric (num) and
# literate is a factor with the levels "no" and "yes".
str(object = edu_equal_1)
## 'data.frame':    12214 obs. of  27 variables:
##  $ hhid              : num  1 1 1 2 2 3 4 4 5 6 ...
##  $ hhweight          : num  627 627 627 627 627 ...
##  $ location          : Factor w/ 2 levels "urban location",..: 1 1 1 1 1 2 2 2 1 1 ...
##  $ region            : Factor w/ 9 levels "Sofia city","Bourgass",..: 8 8 8 9 9 4 4 4 8 8 ...
##  $ ethnicity_head    : Factor w/ 4 levels "Bulgaria","Turks",..: 2 2 2 1 1 1 1 1 1 1 ...
##  $ age               : num  37 11 8 73 70 75 79 80 82 83 ...
##  $ gender            : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 1 2 2 2 ...
##  $ relation          : Factor w/ 9 levels "head                      ",..: 1 3 3 1 2 1 1 2 1 1 ...
##  $ literate          : Factor w/ 2 levels "no","yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ income_mnt        : num  13.3 13.3 13.3 142.5 142.5 ...
##  $ income            : num  160 160 160 1710 1710 ...
##  $ aggregate         : num  1042 1042 1042 3271 3271 ...
##  $ aggr_ind_annual   : num  347 347 347 1635 1635 ...
##  $ educ_completed    : int  2 4 4 4 3 3 3 3 4 4 ...
##  $ grade_complete    : num  4 3 0 3 4 4 4 4 5 5 ...
##  $ grade_all         : num  4 11 8 11 8 8 8 8 13 13 ...
##  $ unemployed        : int  2 1 1 1 1 1 1 1 1 1 ...
##  $ reason_OLF        : int  NA NA NA 3 3 3 9 9 3 3 ...
##  $ sector            : int  NA NA NA NA NA NA 1 1 NA NA ...
##  $ occupation        : int  NA NA NA NA NA NA 5 5 NA NA ...
##  $ earn_mont         : num  0 0 0 0 0 0 20 20 0 0 ...
##  $ earn_ann          : num  0 0 0 0 0 0 240 240 0 0 ...
##  $ hours_week        : num  NA NA NA NA NA NA 30 35 NA NA ...
##  $ hours_mnt         : num  NA NA NA NA NA ...
##  $ fulltime          : int  NA NA NA NA NA NA 1 1 NA NA ...
##  $ hhexp             : num  100 100 100 343 343 ...
##  $ legacy_pension_amt: num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr ""
##  - attr(*, "formats")= chr  "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
##  - attr(*, "types")= int  100 100 108 108 108 100 108 108 108 100 ...
##  - attr(*, "val.labels")= chr  "" "" "location" "region" ...
##  - attr(*, "var.labels")= chr  "hhid" "hhweight" "location" "region" ...
##  - attr(*, "expansion.fields")=List of 12
##   ..$ : chr  "_dta" "_svy_su1" "cluster"
##   ..$ : chr  "_dta" "_svy_strata1" "strata"
##   ..$ : chr  "_dta" "_svy_stages" "1"
##   ..$ : chr  "_dta" "_svy_version" "2"
##   ..$ : chr  "_dta" "__XijVarLabcons" "(sum) cons"
##   ..$ : chr  "_dta" "ReS_Xij" "cons"
##   ..$ : chr  "_dta" "ReS_str" "0"
##   ..$ : chr  "_dta" "ReS_j" "group"
##   ..$ : chr  "_dta" "ReS_ver" "v.2"
##   ..$ : chr  "_dta" "ReS_i" "hhid dur"
##   ..$ : chr  "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc,  gall, health, rent, durables we"| __truncated__
##   ..$ : chr  "_dta" "note0" "1"
##  - attr(*, "version")= int 7
##  - attr(*, "label.table")=List of 12
##   ..$ location: Named int  1 2
##   .. ..- attr(*, "names")= chr  "urban location" "rural location"
##   ..$ region  : Named int  1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr  "Sofia city" "Bourgass" "Varna" "Lovetch" ...
##   ..$ ethnic  : Named int  1 2 3 4
##   .. ..- attr(*, "names")= chr  "Bulgaria" "Turks" "Roma" "Other"
##   ..$ s2_q2   : Named int  1 2
##   .. ..- attr(*, "names")= chr  "male" "female"
##   ..$ s2_q3   : Named int  1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr  "head                      " "spouse/partner            " "child                     " "son/daughter-in-law       " ...
##   ..$ lit     : Named int  1 2
##   .. ..- attr(*, "names")= chr  "no" "yes"
##   ..$         : Named int  1 2 3 4
##   .. ..- attr(*, "names")= chr  "never attanded" "primary" "secondary" "postsecondary"
##   ..$         : Named int  1 2
##   .. ..- attr(*, "names")= chr  "Not unemployed" "Unemployed"
##   ..$         : Named int  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "student" "housewife/childcare" "in retirement" "illness, disability" ...
##   ..$         : Named int  1 2 3 4 5 6 7 8 9 10
##   .. ..- attr(*, "names")= chr  "agriculture" "mining" "manufacturing" "utilities" ...
##   ..$         : Named int  1 2 3 4 5
##   .. ..- attr(*, "names")= chr  "private company" "public works program" "government,public sector, army" "private individual" ...
##   ..$         : Named int  1 2
##   .. ..- attr(*, "names")= chr  "no" "yes"

# Number of observations (people) who are older than 40 and literate
with(edu_equal_1, sum(age > 40 & literate == "yes", na.rm = TRUE))
## [1] 6506

# Number of observations (individuals) with ethnicity_head equal to
# "Bulgaria" and an income above 1000
with(edu_equal_1, sum(income > 1000 & ethnicity_head == "Bulgaria", na.rm = TRUE))
## [1] 8997
  
##############################################
# cat("\014") # clear screen before next section
##############################################

# socio-economic variables from different countries
# Source: Quantitative Data Analysis in Education
# http://cw.routledge.com/textbooks/9780415372985/resources/datasets.asp
# http://cw.routledge.com/textbooks/9780415372985/sav/international.sav
list.files()
##  [1] "1 - importing-data-from-databases.R"                      
##  [2] "1 - importing-data-from-databases.Rmd"                    
##  [3] "1_-_importing-data-from-databases.html"                   
##  [4] "2 - importing-data-from-statistical-software-packages.R"  
##  [5] "2 - importing-data-from-statistical-software-packages.Rmd"
##  [6] "2_-_importing-data-from-statistical-software-packages.Rmd"
##  [7] "3 - importing-data-from-web.R"                            
##  [8] "importing_data_in_r_2_ch1.pdf"                            
##  [9] "importing_data_in_r_2_ch2.pdf"                            
## [10] "importing_data_in_r_2_ch3.pdf"                            
## [11] "importing_data_in_r_2_ch4.pdf"                            
## [12] "importing_data_in_r_2_ch5.pdf"                            
## [13] "rsconnect"                                                
## [14] "worldbank"

# .sav files are binary, so they are downloaded with mode = "wb";
# a text-mode download can corrupt the file (translated newlines).
download.file(
  url = "http://cw.routledge.com/textbooks/9780415372985/sav/international.sav",
  destfile = "international.sav",
  mode = "wb")
list.files()
##  [1] "1 - importing-data-from-databases.R"                      
##  [2] "1 - importing-data-from-databases.Rmd"                    
##  [3] "1_-_importing-data-from-databases.html"                   
##  [4] "2 - importing-data-from-statistical-software-packages.R"  
##  [5] "2 - importing-data-from-statistical-software-packages.Rmd"
##  [6] "2_-_importing-data-from-statistical-software-packages.Rmd"
##  [7] "3 - importing-data-from-web.R"                            
##  [8] "importing_data_in_r_2_ch1.pdf"                            
##  [9] "importing_data_in_r_2_ch2.pdf"                            
## [10] "importing_data_in_r_2_ch3.pdf"                            
## [11] "importing_data_in_r_2_ch4.pdf"                            
## [12] "importing_data_in_r_2_ch5.pdf"                            
## [13] "international.sav"                                        
## [14] "rsconnect"                                                
## [15] "worldbank"

# Same file name from the DataCamp location; this download overwrites
# the international.sav fetched above.
download.file(
  url = "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/international.sav",
  destfile = "international.sav",
  mode = "wb")
list.files()
##  [1] "1 - importing-data-from-databases.R"                      
##  [2] "1 - importing-data-from-databases.Rmd"                    
##  [3] "1_-_importing-data-from-databases.html"                   
##  [4] "2 - importing-data-from-statistical-software-packages.R"  
##  [5] "2 - importing-data-from-statistical-software-packages.Rmd"
##  [6] "2_-_importing-data-from-statistical-software-packages.Rmd"
##  [7] "3 - importing-data-from-web.R"                            
##  [8] "importing_data_in_r_2_ch1.pdf"                            
##  [9] "importing_data_in_r_2_ch2.pdf"                            
## [10] "importing_data_in_r_2_ch3.pdf"                            
## [11] "importing_data_in_r_2_ch4.pdf"                            
## [12] "importing_data_in_r_2_ch5.pdf"                            
## [13] "international.sav"                                        
## [14] "rsconnect"                                                
## [15] "worldbank"

# Read international.sav into the data frame `demo`
demo <- read.spss(file = "international.sav", to.data.frame = TRUE)
## re-encoding from CP1252

# Boxplot of the gdp column of demo
boxplot(demo[["gdp"]])


##############################################
# cat("\014") # clear screen before next section
##############################################

# Pearson's Correlation
# 
# measurement to evaluate the linear dependency between two variables, say X and Y.
# It can range from -1 to 1; 
# if it's close to 1
# it means that there is a strong positive association between the variables. 
# If X is high, also Y tends to be high.
# If it's close to -1, there is a strong negative association: 
# If X is high, Y tends to be low. 
# When the Pearson correlation between two variables is 0, 
# these variables are possibly independent: 
# there is no linear association between X and Y
# (a non-linear relationship is still possible).
# 
# Pearson correlation coefficient between the two numerical variables
# gdp and f_illit (female illiteracy rate)
with(demo, cor(gdp, f_illit, method = "pearson"))
## [1] -0.4476856
# That indicates a negative association between GDP and female illiteracy

##############################################
# cat("\014") # clear screen before next section
##############################################

# Read international.sav again, this time as demo_1 (default settings:
# variables with value labels become R factors)
demo_1 <- read.spss(file = "international.sav", to.data.frame = TRUE)
## re-encoding from CP1252

# Show the first six rows of demo_1
head(demo_1, n = 6L)
##   id              country  contint m_illit f_illit lifeexpt  gdp
## 1  1 Argentina            Americas     3.0     3.0       16 3375
## 2  2 Benin                  Africa    45.2    74.5        7  521
## 3  3 Burundi                Africa    33.2    48.1        5   86
## 4  4 Chile                Americas     4.2     4.4       14 4523
## 5  5 Dominican Republic   Americas    12.0    12.7       12 2408
## 6  6 El Salvador          Americas    17.6    22.9       11 2302

# Read international.sav as demo_2; with use.value.labels = FALSE,
# variables with value labels are NOT converted to R factors
demo_2 <- read.spss(
  file = "international.sav",
  to.data.frame = TRUE,
  use.value.labels = FALSE
)
## re-encoding from CP1252

# Show the first six rows of demo_2
head(demo_2, n = 6L)
##   id              country contint m_illit f_illit lifeexpt  gdp
## 1  1 Argentina                  2     3.0     3.0       16 3375
## 2  2 Benin                      1    45.2    74.5        7  521
## 3  3 Burundi                    1    33.2    48.1        5   86
## 4  4 Chile                      2     4.2     4.4       14 4523
## 5  5 Dominican Republic         2    12.0    12.7       12 2408
## 6  6 El Salvador                2    17.6    22.9       11 2302