Course notes for Importing Data in R (Part 2)
DBI, RMySQL, SQL examples
readr, gdata, and readxl from the web
httr, jsonlite
haven and foreign with SAS, STATA, and SPSS files
## The DBI package is already loaded
# Open a connection to the "tweater" MySQL database via the RMySQL driver
con <- dbConnect(
  RMySQL::MySQL(),
  dbname = "tweater",
  host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
  port = 3306,
  user = "student",
  password = "datacamp"
)
# Inspect the class of the connection object
class(con)
## [1] "MySQLConnection"
## attr(,"package")
## [1] "RMySQL"
## The DBI package is already loaded
## The MySQL database connection is already made with con
# Build a vector of table names: tables
tables <- dbListTables(con)
# Display structure of tables
str(tables)
## chr [1:3] "comments" "tweats" "users"
# Import the users table from tweater: users
users <- dbReadTable(con, "users")
# Print users
users
## id name login
## 1 1 elisabeth elismith
## 2 2 mike mikey
## 3 3 thea teatime
## 4 4 thomas tomatotom
## 5 5 oliver olivander
## 6 6 kate katebenn
## 7 7 anjali lianja
# Collect the names of every table available through the connection
table_names <- dbListTables(con)
# Read each table in turn; the result is a list with one data frame per table
tables <- lapply(table_names, function(tbl) dbReadTable(con, tbl))
# Show the imported tables
tables
## [[1]]
## id tweat_id user_id message
## 1 1022 87 7 nice!
## 2 1000 77 7 great!
## 3 1011 49 5 love it
## 4 1012 87 1 awesome! thanks!
## 5 1010 88 6 yuck!
## 6 1026 77 4 not my thing!
## 7 1004 49 1 this is fabulous!
## 8 1030 75 6 so easy!
## 9 1025 88 2 oh yes
## 10 1007 49 3 serious?
## 11 1020 77 1 couldn't be better
## 12 1014 77 1 saved my day
##
## [[2]]
## id user_id
## 1 75 3
## 2 88 4
## 3 77 6
## 4 87 5
## 5 49 1
## 6 24 7
## post
## 1 break egg. bake egg. eat egg.
## 2 wash strawberries. add ice. blend. enjoy.
## 3 2 slices of bread. add cheese. grill. heaven.
## 4 open and crush avocado. add shrimps. perfect starter.
## 5 nachos. add tomato sauce, minced meat and cheese. oven for 10 mins.
## 6 just eat an apple. simply and healthy.
## date
## 1 2015-09-05
## 2 2015-09-14
## 3 2015-09-21
## 4 2015-09-22
## 5 2015-09-22
## 6 2015-09-24
##
## [[3]]
## id name login
## 1 1 elisabeth elismith
## 2 2 mike mikey
## 3 3 thea teatime
## 4 4 thomas tomatotom
## 5 5 oliver olivander
## 6 6 kate katebenn
## 7 7 anjali lianja
 Â
# Import tweat_id column of comments where user_id is 1: elisabeth
elisabeth <- dbGetQuery(con, "select tweat_id from comments where user_id = 1")
# Print elisabeth
elisabeth
## tweat_id
## 1 87
## 2 49
## 3 77
## 4 77
# Import post column of tweats where date is higher than '2015-09-21': latest
latest <- dbGetQuery(con, "select post from tweats where date > '2015-09-21'")
# Print latest
latest
## post
## 1 open and crush avocado. add shrimps. perfect starter.
## 2 nachos. add tomato sauce, minced meat and cheese. oven for 10 mins.
## 3 just eat an apple. simply and healthy.
# Create data frame specific
specific <- dbGetQuery(con, "select message from comments where tweat_id = 77 AND user_id > 4")
# Print specific
specific
## message
## 1 great!
# Create data frame short
short <- dbGetQuery(con, "select id, name from users where CHAR_LENGTH(name) < 5")
# Print short
short
## id name
## 1 2 mike
## 2 3 thea
## 3 6 kate
sql <- "SELECT post, message
FROM tweats INNER JOIN comments on tweats.id = tweat_id
WHERE tweat_id = 77"
df <- dbGetQuery(con, sql)
df
## post message
## 1 2 slices of bread. add cheese. grill. heaven. great!
## 2 2 slices of bread. add cheese. grill. heaven. not my thing!
## 3 2 slices of bread. add cheese. grill. heaven. couldn't be better
## 4 2 slices of bread. add cheese. grill. heaven. saved my day
dbSendQuery, dbFetch, and dbClearResult
# Submit the query; rows are retrieved afterwards with dbFetch()
res <- dbSendQuery(
  con,
  "SELECT * FROM comments WHERE user_id > 4"
)
# Retrieve the first two rows of the result
dbFetch(res, n = 2)
## id tweat_id user_id message
## 1 1022 87 7 nice!
## 2 1000 77 7 great!
# Retrieve the next two rows
dbFetch(res, n = 2)
## id tweat_id user_id message
## 1 1011 49 5 love it
## 2 1010 88 6 yuck!
# Release the result set now that we are done with it
dbClearResult(res)
## [1] TRUE
# Create the data frame long_tweats
long_tweats <- dbGetQuery(con, "select post, date from tweats where CHAR_LENGTH(post) > 40")
# Print long_tweats
long_tweats
## post
## 1 wash strawberries. add ice. blend. enjoy.
## 2 2 slices of bread. add cheese. grill. heaven.
## 3 open and crush avocado. add shrimps. perfect starter.
## 4 nachos. add tomato sauce, minced meat and cheese. oven for 10 mins.
## date
## 1 2015-09-14
## 2 2015-09-21
## 3 2015-09-22
## 4 2015-09-22
# Disconnect from the database
dbDisconnect(con)
## [1] TRUE
 Â
# The readr package is already loaded
# Import the csv file: pools
url_csv <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/swimming_pools.csv"
pools <- read_csv(url_csv)
head(pools)
## # A tibble: 6 x 4
## Name Address
## <chr> <chr>
## 1 Acacia Ridge Leisure Centre 1391 Beaudesert Road, Acacia Ridge
## 2 Bellbowrie Pool Sugarwood Street, Bellbowrie
## 3 Carole Park Cnr Boundary Road and Waterford Road Wacol
## 4 Centenary Pool (inner City) 400 Gregory Terrace, Spring Hill
## 5 Chermside Pool 375 Hamilton Road, Chermside
## 6 Colmslie Pool (Morningside) 400 Lytton Road, Morningside
## # ... with 2 more variables: Latitude <dbl>, Longitude <dbl>
# Import the txt file: potatoes
url_delim <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/potatoes.txt"
potatoes <- read_tsv(url_delim)
head(potatoes)
## # A tibble: 6 x 8
## area temp size storage method texture flavor moistness
## <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 1 1 1 1 2.9 3.2 3.0
## 2 1 1 1 1 2 2.3 2.5 2.6
## 3 1 1 1 1 3 2.5 2.8 2.8
## 4 1 1 1 1 4 2.1 2.9 2.4
## 5 1 1 1 1 5 1.9 2.8 2.2
## 6 1 1 1 2 1 1.8 3.0 1.7
# https URL to the swimming_pools csv file.
url_csv <- "https://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/swimming_pools.csv"
# Import the file using read.csv(): pools1
pools1 <- read.csv(url_csv)
str(pools1)
## 'data.frame': 20 obs. of 4 variables:
## $ Name : Factor w/ 20 levels "Acacia Ridge Leisure Centre",..: 1 2 3 4 5 6 19 7 8 9 ...
## $ Address : Factor w/ 20 levels "1 Fairlead Crescent, Manly",..: 5 20 18 10 9 11 6 15 12 17 ...
## $ Latitude : num -27.6 -27.6 -27.6 -27.5 -27.4 ...
## $ Longitude: num 153 153 153 153 153 ...
# Import the file using read_csv(): pools2
pools2 <- read_csv(url_csv)
str(pools2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 20 obs. of 4 variables:
## $ Name : chr "Acacia Ridge Leisure Centre" "Bellbowrie Pool" "Carole Park" "Centenary Pool (inner City)" ...
## $ Address : chr "1391 Beaudesert Road, Acacia Ridge" "Sugarwood Street, Bellbowrie" "Cnr Boundary Road and Waterford Road Wacol" "400 Gregory Terrace, Spring Hill" ...
## $ Latitude : num -27.6 -27.6 -27.6 -27.5 -27.4 ...
## $ Longitude: num 153 153 153 153 153 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 4
## .. ..$ Name : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Address : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Latitude : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Longitude: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# The readxl and gdata packages are already loaded
# Specification of url: url_xls
url_xls <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/latitude.xls"
# Import the .xls file with gdata: excel_gdata
excel_gdata <- read.xls(url_xls)
str(excel_gdata)
## 'data.frame': 246 obs. of 2 variables:
## $ country: Factor w/ 246 levels "\xc5land","Afghanistan",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ X1700 : num 34.6 34.6 41.3 36.7 -14.3 ...
# Download file behind URL, name it local_latitude.xls
# mode = "wb" writes the file in binary mode; without it a binary file such
# as .xls can be corrupted on Windows, where the default mode is text ("w")
download.file(url_xls, "data/local_latitude.xls", mode = "wb")
# Import the local .xls file with readxl: excel_readxl
excel_readxl <- read_excel("data/local_latitude.xls")
# Show the structure of the imported data
str(excel_readxl)
## Classes 'tbl_df', 'tbl' and 'data.frame': 246 obs. of 2 variables:
## $ country: chr "Afghanistan" "Akrotiri and Dhekelia" "Albania" "Algeria" ...
## $ 1700 : num 34.6 34.6 41.3 36.7 -14.3 ...
load, used to load RData files, does not accept URLs, so if there is an RData
file on the web you will need to download it first with download.file
# https URL to the wine RData file.
url_rdata <- "https://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/wine.RData"
# Download the wine file into the data directory
# mode = "wb" forces a binary write so the RData file is not corrupted on
# platforms (notably Windows) where the default download mode is text
download.file(url_rdata, "data/wine_local.RData", mode = "wb")
# Load the wine data into your workspace using load()
load("data/wine_local.RData")
# Print out the summary of the wine data
summary(wine)
## Alcohol Malic acid Ash Alcalinity of ash
## Min. :11.03 Min. :0.74 Min. :1.360 Min. :10.60
## 1st Qu.:12.36 1st Qu.:1.60 1st Qu.:2.210 1st Qu.:17.20
## Median :13.05 Median :1.87 Median :2.360 Median :19.50
## Mean :12.99 Mean :2.34 Mean :2.366 Mean :19.52
## 3rd Qu.:13.67 3rd Qu.:3.10 3rd Qu.:2.560 3rd Qu.:21.50
## Max. :14.83 Max. :5.80 Max. :3.230 Max. :30.00
## Magnesium Total phenols Flavanoids Nonflavanoid phenols
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.740 1st Qu.:1.200 1st Qu.:0.2700
## Median : 98.00 Median :2.350 Median :2.130 Median :0.3400
## Mean : 99.59 Mean :2.292 Mean :2.023 Mean :0.3623
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.860 3rd Qu.:0.4400
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## Proanthocyanins Color intensity Hue Proline
## Min. :0.410 Min. : 1.280 Min. :1.270 Min. : 278.0
## 1st Qu.:1.250 1st Qu.: 3.210 1st Qu.:1.930 1st Qu.: 500.0
## Median :1.550 Median : 4.680 Median :2.780 Median : 672.0
## Mean :1.587 Mean : 5.055 Mean :2.604 Mean : 745.1
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:3.170 3rd Qu.: 985.0
## Max. :3.580 Max. :13.000 Max. :4.000 Max. :1680.0
## These all work
# Base-R import straight from the URL, printing only the head of the result
head(
  read.csv("http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/swimming_pools.csv")
)
## Name Address
## 1 Acacia Ridge Leisure Centre 1391 Beaudesert Road, Acacia Ridge
## 2 Bellbowrie Pool Sugarwood Street, Bellbowrie
## 3 Carole Park Cnr Boundary Road and Waterford Road Wacol
## 4 Centenary Pool (inner City) 400 Gregory Terrace, Spring Hill
## 5 Chermside Pool 375 Hamilton Road, Chermside
## 6 Colmslie Pool (Morningside) 400 Lytton Road, Morningside
## Latitude Longitude
## 1 -27.58616 153.0264
## 2 -27.56547 152.8911
## 3 -27.60744 152.9315
## 4 -27.45537 153.0251
## 5 -27.38583 153.0351
## 6 -27.45516 153.0789
# readr import of the tab-separated file from the URL, printing only the head
head(
  read_tsv("http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/potatoes.txt")
)
## # A tibble: 6 x 8
## area temp size storage method texture flavor moistness
## <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 1 1 1 1 2.9 3.2 3.0
## 2 1 1 1 1 2 2.3 2.5 2.6
## 3 1 1 1 1 3 2.5 2.8 2.8
## 4 1 1 1 1 4 2.1 2.9 2.4
## 5 1 1 1 1 5 1.9 2.8 2.2
## 6 1 1 1 2 1 1.8 3.0 1.7
# .xlsx is a binary format: write it with mode = "wb" so the downloaded file
# is not corrupted on Windows, where the default mode is text ("w")
download.file(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/latitude.xlsx",
  "data/lat.xlsx",
  mode = "wb"
)
## This will not work. you need to download the RData file first then load it
# load("https://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/wine.RData")
# The httr package is already loaded
# Get the url, save response to resp
url <- "http://www.example.com/"
resp <- GET(url)
# Print resp
resp
## Response [http://www.example.com/]
## Date: 2017-07-11 13:47
## Status: 200
## Content-Type: text/html
## Size: 1.27 kB
## <!doctype html>
## <html>
## <head>
## <title>Example Domain</title>
##
## <meta charset="utf-8" />
## <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
## <meta name="viewport" content="width=device-width, initial-scale=1" />
## <style type="text/css">
## body {
## ...
# Extract the body of resp as raw bytes: raw_content
raw_content <- content(resp, as = "raw")
# Show the first few bytes of raw_content
head(raw_content)
## [1] 3c 21 64 6f 63 74
# Get the url
url <- "http://www.omdbapi.com/?apikey=ff21610b&t=Annie+Hall&y=&plot=short&r=json"
resp <- GET(url)
# Print resp
resp
## Response [http://www.omdbapi.com/?apikey=ff21610b&t=Annie+Hall&y=&plot=short&r=json]
## Date: 2017-07-11 13:47
## Status: 200
## Content-Type: application/json; charset=utf-8
## Size: 902 B
# Print content of resp as text
content(resp, as = 'text')
## [1] "{\"Title\":\"Annie Hall\",\"Year\":\"1977\",\"Rated\":\"PG\",\"Released\":\"20 Apr 1977\",\"Runtime\":\"93 min\",\"Genre\":\"Comedy, Romance\",\"Director\":\"Woody Allen\",\"Writer\":\"Woody Allen, Marshall Brickman\",\"Actors\":\"Woody Allen, Diane Keaton, Tony Roberts, Carol Kane\",\"Plot\":\"Neurotic New York comedian Alvy Singer falls in love with the ditzy Annie Hall.\",\"Language\":\"English, German\",\"Country\":\"USA\",\"Awards\":\"Won 4 Oscars. Another 26 wins & 8 nominations.\",\"Poster\":\"https://images-na.ssl-images-amazon.com/images/M/MV5BZDg1OGQ4YzgtM2Y2NS00NjA3LWFjYTctMDRlMDI3NWE1OTUyXkEyXkFqcGdeQXVyMjUzOTY1NTc@._V1_SX300.jpg\",\"Ratings\":[{\"Source\":\"Internet Movie Database\",\"Value\":\"8.1/10\"},{\"Source\":\"Rotten Tomatoes\",\"Value\":\"97%\"}],\"Metascore\":\"N/A\",\"imdbRating\":\"8.1\",\"imdbVotes\":\"208,523\",\"imdbID\":\"tt0075686\",\"Type\":\"movie\",\"DVD\":\"28 Apr 1998\",\"BoxOffice\":\"N/A\",\"Production\":\"United Artists\",\"Website\":\"N/A\",\"Response\":\"True\"}"
# Print content of resp
content(resp)
## $Title
## [1] "Annie Hall"
##
## $Year
## [1] "1977"
##
## $Rated
## [1] "PG"
##
## $Released
## [1] "20 Apr 1977"
##
## $Runtime
## [1] "93 min"
##
## $Genre
## [1] "Comedy, Romance"
##
## $Director
## [1] "Woody Allen"
##
## $Writer
## [1] "Woody Allen, Marshall Brickman"
##
## $Actors
## [1] "Woody Allen, Diane Keaton, Tony Roberts, Carol Kane"
##
## $Plot
## [1] "Neurotic New York comedian Alvy Singer falls in love with the ditzy Annie Hall."
##
## $Language
## [1] "English, German"
##
## $Country
## [1] "USA"
##
## $Awards
## [1] "Won 4 Oscars. Another 26 wins & 8 nominations."
##
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BZDg1OGQ4YzgtM2Y2NS00NjA3LWFjYTctMDRlMDI3NWE1OTUyXkEyXkFqcGdeQXVyMjUzOTY1NTc@._V1_SX300.jpg"
##
## $Ratings
## $Ratings[[1]]
## $Ratings[[1]]$Source
## [1] "Internet Movie Database"
##
## $Ratings[[1]]$Value
## [1] "8.1/10"
##
##
## $Ratings[[2]]
## $Ratings[[2]]$Source
## [1] "Rotten Tomatoes"
##
## $Ratings[[2]]$Value
## [1] "97%"
##
##
##
## $Metascore
## [1] "N/A"
##
## $imdbRating
## [1] "8.1"
##
## $imdbVotes
## [1] "208,523"
##
## $imdbID
## [1] "tt0075686"
##
## $Type
## [1] "movie"
##
## $DVD
## [1] "28 Apr 1998"
##
## $BoxOffice
## [1] "N/A"
##
## $Production
## [1] "United Artists"
##
## $Website
## [1] "N/A"
##
## $Response
## [1] "True"
 Â
# The jsonlite package is already loaded
# wine_json is a JSON
wine_json <- '{"name":"Chateau Migraine", "year":1997, "alcohol_pct":12.4, "color":"red", "awarded":false}'
# Convert wine_json into a list: wine
wine <- fromJSON(wine_json)
# Print structure of wine
str(wine)
## List of 5
## $ name : chr "Chateau Migraine"
## $ year : int 1997
## $ alcohol_pct: num 12.4
## $ color : chr "red"
## $ awarded : logi FALSE
wine
## $name
## [1] "Chateau Migraine"
##
## $year
## [1] 1997
##
## $alcohol_pct
## [1] 12.4
##
## $color
## [1] "red"
##
## $awarded
## [1] FALSE
# Definition of quandl_url
quandl_url <- "http://www.quandl.com/api/v1/datasets/IWS/INTERNET_INDIA.json?auth_token=i83asDsiWUUyfoypkgMz"
# Import Quandl data: quandl_data
quandl_data <- fromJSON(quandl_url)
# Print structure of quandl_data
str(quandl_data)
## List of 17
## $ errors : Named list()
## $ id : int 2351831
## $ source_name : chr "Internet World Stats"
## $ source_code : chr "IWS"
## $ code : chr "INTERNET_INDIA"
## $ name : chr "India Internet Usage"
## $ urlize_name : chr "India-Internet-Usage"
## $ display_url : chr "http://www.internetworldstats.com/asia/in.htm"
## $ description : chr "Internet Usage and Population Statistics"
## $ updated_at : chr "2016-01-01T04:23:55.235Z"
## $ frequency : chr "annual"
## $ from_date : chr "1998-12-31"
## $ to_date : chr "2012-12-31"
## $ column_names: chr [1:4] "YEAR" "Users" "Population" "% Pen."
## $ premium : logi FALSE
## $ data : chr [1:13, 1:4] "2012-12-31" "2010-12-31" "2009-12-31" "2007-12-31" ...
## $ type : chr "Time Series"
# Definition of the URLs
url_sw4 <- "http://www.omdbapi.com/?apikey=ff21610b&i=tt0076759&r=json"
url_sw3 <- "http://www.omdbapi.com/?apikey=ff21610b&i=tt0121766&r=json"
# Import two URLs with fromJSON(): sw4 and sw3
sw4 <- fromJSON(url_sw4)
sw3 <- fromJSON(url_sw3)
# Print out the Title element of both lists
sw4$Title
## [1] "Star Wars: Episode IV - A New Hope"
sw3$Title
## [1] "Star Wars: Episode III - Revenge of the Sith"
# Is the release year of sw4 later than sw3?
str(sw4)
## List of 25
## $ Title : chr "Star Wars: Episode IV - A New Hope"
## $ Year : chr "1977"
## $ Rated : chr "PG"
## $ Released : chr "25 May 1977"
## $ Runtime : chr "121 min"
## $ Genre : chr "Action, Adventure, Fantasy"
## $ Director : chr "George Lucas"
## $ Writer : chr "George Lucas"
## $ Actors : chr "Mark Hamill, Harrison Ford, Carrie Fisher, Peter Cushing"
## $ Plot : chr "Luke Skywalker joins forces with a Jedi Knight, a cocky pilot, a wookiee and two droids to save the galaxy from"| __truncated__
## $ Language : chr "English"
## $ Country : chr "USA"
## $ Awards : chr "Won 6 Oscars. Another 50 wins & 28 nominations."
## $ Poster : chr "https://images-na.ssl-images-amazon.com/images/M/MV5BYzQ2OTk4N2QtOGQwNy00MmI3LWEwNmEtOTk0OTY3NDk2MGJkL2ltYWdlL2"| __truncated__
## $ Ratings :'data.frame': 3 obs. of 2 variables:
## ..$ Source: chr [1:3] "Internet Movie Database" "Rotten Tomatoes" "Metacritic"
## ..$ Value : chr [1:3] "8.7/10" "93%" "92/100"
## $ Metascore : chr "92"
## $ imdbRating: chr "8.7"
## $ imdbVotes : chr "982,688"
## $ imdbID : chr "tt0076759"
## $ Type : chr "movie"
## $ DVD : chr "21 Sep 2004"
## $ BoxOffice : chr "N/A"
## $ Production: chr "20th Century Fox"
## $ Website : chr "http://www.starwars.com/episode-iv/"
## $ Response : chr "True"
# Year is imported as a character string (see the str(sw4) output), so convert
# both values to integers to compare them numerically instead of
# lexicographically as strings
as.integer(sw4$Year) > as.integer(sw3$Year)
## [1] FALSE
The jsonlite package does a good job of pulling JSON data into data frames or lists.
It also provides minify and prettify to help with viewing JSON data.
# The jsonlite package is already loaded
# Challenge 1
json1 <- '[1, 2, 3, 4, 5, 6]'
fromJSON(json1)
## [1] 1 2 3 4 5 6
# Challenge 2
json2 <- '{"a": [1, 2, 3], "b": [4, 5, 6]}'
fromJSON(json2)
## $a
## [1] 1 2 3
##
## $b
## [1] 4 5 6
# Challenge 1
json1 <- '[[1, 2], [3, 4]]'
fromJSON(json1)
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
# Challenge 2
json2 <- '[{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]'
fromJSON(json2)
## a b
## 1 1 2
## 2 3 4
## 3 5 6
# URL pointing to the .csv file
url_csv <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/water.csv"
# Import the .csv file located at url_csv, keeping strings as character
# vectors; FALSE is spelled out because the shorthand F can be reassigned
water <- read.csv(url_csv, stringsAsFactors = FALSE)
# Show the structure of the imported data
str(water)
## 'data.frame': 77 obs. of 5 variables:
## $ water: chr "Algeria" "American Samoa" "Angola" "Antigua and Barbuda" ...
## $ X1992: num 0.064 NA 0.000138 0.0033 0.000749 ...
## $ X1997: num NA NA NA NA 0.000749 ...
## $ X2002: num 0.017 NA 0.000138 NA 0.000749 ...
## $ X2007: num NA NA NA NA NA ...
# Convert the data file according to the requirements
water_json <- toJSON(head(water))
# Print out water_json
water_json
## [{"water":"Algeria","X1992":0.064,"X2002":0.017},{"water":"American Samoa"},{"water":"Angola","X1992":0.0001,"X2002":0.0001},{"water":"Antigua and Barbuda","X1992":0.0033},{"water":"Argentina","X1992":0.0007,"X1997":0.0007,"X2002":0.0007},{"water":"Australia","X1992":0.0298,"X2002":0.0298}]
# Convert mtcars to a pretty JSON: pretty_json
## I'm going to shorten this to just 3 rows to save output space
# TRUE is spelled out because the shorthand T can be reassigned
pretty_json <- toJSON(head(mtcars, 3), pretty = TRUE)
# Print pretty_json
pretty_json
## [
## {
## "mpg": 21,
## "cyl": 6,
## "disp": 160,
## "hp": 110,
## "drat": 3.9,
## "wt": 2.62,
## "qsec": 16.46,
## "vs": 0,
## "am": 1,
## "gear": 4,
## "carb": 4,
## "_row": "Mazda RX4"
## },
## {
## "mpg": 21,
## "cyl": 6,
## "disp": 160,
## "hp": 110,
## "drat": 3.9,
## "wt": 2.875,
## "qsec": 17.02,
## "vs": 0,
## "am": 1,
## "gear": 4,
## "carb": 4,
## "_row": "Mazda RX4 Wag"
## },
## {
## "mpg": 22.8,
## "cyl": 4,
## "disp": 108,
## "hp": 93,
## "drat": 3.85,
## "wt": 2.32,
## "qsec": 18.61,
## "vs": 1,
## "am": 1,
## "gear": 4,
## "carb": 1,
## "_row": "Datsun 710"
## }
## ]
# Minify pretty_json: mini_json
mini_json <- minify(pretty_json)
# Print mini_json
mini_json
## [{"mpg":21,"cyl":6,"disp":160,"hp":110,"drat":3.9,"wt":2.62,"qsec":16.46,"vs":0,"am":1,"gear":4,"carb":4,"_row":"Mazda RX4"},{"mpg":21,"cyl":6,"disp":160,"hp":110,"drat":3.9,"wt":2.875,"qsec":17.02,"vs":0,"am":1,"gear":4,"carb":4,"_row":"Mazda RX4 Wag"},{"mpg":22.8,"cyl":4,"disp":108,"hp":93,"drat":3.85,"wt":2.32,"qsec":18.61,"vs":1,"am":1,"gear":4,"carb":1,"_row":"Datsun 710"}]
haven can load data from a variety of different common statistical tools.
# The haven package is already loaded
# Read the SAS dataset from its URL: sales
sales <- read_sas(
  "http://assets.datacamp.com/course/importing_data_into_r/sales.sas7bdat"
)
# Show the structure of sales
str(sales)
## Classes 'tbl_df', 'tbl' and 'data.frame': 431 obs. of 4 variables:
## $ purchase: num 0 0 1 1 0 0 0 0 0 0 ...
## $ age : num 41 47 41 39 32 32 33 45 43 40 ...
## $ gender : chr "Female" "Female" "Female" "Female" ...
## $ income : chr "Low" "Low" "Low" "Low" ...
## - attr(*, "label")= chr "SALES"
# Import the data from the URL: sugar
sugar <- read_stata('http://assets.datacamp.com/production/course_1478/datasets/trade.dta')
# Structure of sugar
str(sugar)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10 obs. of 5 variables:
## $ Date :Class 'labelled' atomic [1:10] 10 9 8 7 6 5 4 3 2 1
## .. ..- attr(*, "label")= chr "Date"
## .. ..- attr(*, "format.stata")= chr "%9.0g"
## .. ..- attr(*, "labels")= Named num [1:10] 1 2 3 4 5 6 7 8 9 10
## .. .. ..- attr(*, "names")= chr [1:10] "2004-12-31" "2005-12-31" "2006-12-31" "2007-12-31" ...
## $ Import : atomic 37664782 16316512 11082246 35677943 9879878 ...
## ..- attr(*, "label")= chr "Import"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Weight_I: atomic 54029106 21584365 14526089 55034932 14806865 ...
## ..- attr(*, "label")= chr "Weight_I"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Export : atomic 5.45e+07 1.03e+08 3.79e+07 4.85e+07 7.15e+07 ...
## ..- attr(*, "label")= chr "Export"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Weight_E: atomic 9.34e+07 1.58e+08 8.80e+07 1.12e+08 1.32e+08 ...
## ..- attr(*, "label")= chr "Weight_E"
## ..- attr(*, "format.stata")= chr "%9.0g"
## - attr(*, "label")= chr "Written by R."
# Convert values in Date column to dates
sugar$Date <- as.Date(as_factor(sugar$Date))
# Structure of sugar again
str(sugar)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10 obs. of 5 variables:
## $ Date : Date, format: "2013-12-31" "2012-12-31" ...
## $ Import : atomic 37664782 16316512 11082246 35677943 9879878 ...
## ..- attr(*, "label")= chr "Import"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Weight_I: atomic 54029106 21584365 14526089 55034932 14806865 ...
## ..- attr(*, "label")= chr "Weight_I"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Export : atomic 5.45e+07 1.03e+08 3.79e+07 4.85e+07 7.15e+07 ...
## ..- attr(*, "label")= chr "Export"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ Weight_E: atomic 9.34e+07 1.58e+08 8.80e+07 1.12e+08 1.32e+08 ...
## ..- attr(*, "label")= chr "Weight_E"
## ..- attr(*, "format.stata")= chr "%9.0g"
## - attr(*, "label")= chr "Written by R."
sugar <- read_stata('http://assets.datacamp.com/production/course_1478/datasets/trade.dta')
plot(sugar$Import, sugar$Weight_I)
They are relatively positively correlated
# Read the SPSS file person.sav from its URL: traits
traits <- read_sav(
  "http://assets.datacamp.com/production/course_1478/datasets/person.sav"
)
# Per-column summary statistics of traits
summary(traits)
## Neurotic Extroversion Agreeableness Conscientiousness
## Min. : 0.00 Min. : 5.00 Min. :15.00 Min. : 7.00
## 1st Qu.:18.00 1st Qu.:26.00 1st Qu.:39.00 1st Qu.:25.00
## Median :24.00 Median :31.00 Median :45.00 Median :30.00
## Mean :23.63 Mean :30.23 Mean :44.55 Mean :30.85
## 3rd Qu.:29.00 3rd Qu.:34.00 3rd Qu.:50.00 3rd Qu.:36.00
## Max. :44.00 Max. :65.00 Max. :73.00 Max. :58.00
## NA's :14 NA's :16 NA's :19 NA's :14
# Print out a subset
subset(traits,
subset = Extroversion > 40 & Agreeableness > 40)
## # A tibble: 8 x 4
## Neurotic Extroversion Agreeableness Conscientiousness
## <dbl> <dbl> <dbl> <dbl>
## 1 38 43 49 29
## 2 20 42 46 31
## 3 18 42 49 31
## 4 42 43 44 29
## 5 30 42 51 24
## 6 18 42 50 25
## 7 27 45 55 23
## 8 18 43 57 34
# Import SPSS data from the URL: work
work <- read_sav('http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/employee.sav')
# Display summary of work$GENDER
summary(work$GENDER)
## Length Class Mode
## 474 labelled character
# Convert work$GENDER to a factor
work$GENDER <- as_factor(work$GENDER)
# Display summary of work$GENDER again
summary(work$GENDER)
## Female Male
## 216 258
foreign
can import data from SAS, STATA, SPSS as well as SYSTAT, Weka, and others.haven
.foreign
cannot import .sas7bdat for some reason, but only .xport libraries. Check out the sas7bdat
package if you don't want to use haven
# The foreign package is already loaded
# Read the Stata file florida.dta from its URL into the data frame florida
florida <- read.dta(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/florida.dta"
)
# Show the last six rows of florida
tail(florida, n = 6)
## gore bush buchanan nader total
## 62 2647 4051 27 59 6784
## 63 1399 2326 26 29 3780
## 64 97063 82214 396 2436 182109
## 65 3835 4511 46 149 8541
## 66 5637 12176 120 265 18198
## 67 2796 4983 88 93 7960
# Store the location of edequality.dta: path
# (this is a complete URL, so it is assigned directly rather than built
# with file.path())
path <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/edequality.dta"
# Import with the read.dta() defaults and print the structure: edu_equal_1
edu_equal_1 <- read.dta(file = path)
str(edu_equal_1)
## 'data.frame': 12214 obs. of 27 variables:
## $ hhid : num 1 1 1 2 2 3 4 4 5 6 ...
## $ hhweight : num 627 627 627 627 627 ...
## $ location : Factor w/ 2 levels "urban location",..: 1 1 1 1 1 2 2 2 1 1 ...
## $ region : Factor w/ 9 levels "Sofia city","Bourgass",..: 8 8 8 9 9 4 4 4 8 8 ...
## $ ethnicity_head : Factor w/ 4 levels "Bulgaria","Turks",..: 2 2 2 1 1 1 1 1 1 1 ...
## $ age : num 37 11 8 73 70 75 79 80 82 83 ...
## $ gender : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 1 2 2 2 ...
## $ relation : Factor w/ 9 levels "head ",..: 1 3 3 1 2 1 1 2 1 1 ...
## $ literate : Factor w/ 2 levels "no","yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ income_mnt : num 13.3 13.3 13.3 142.5 142.5 ...
## $ income : num 160 160 160 1710 1710 ...
## $ aggregate : num 1042 1042 1042 3271 3271 ...
## $ aggr_ind_annual : num 347 347 347 1635 1635 ...
## $ educ_completed : int 2 4 4 4 3 3 3 3 4 4 ...
## $ grade_complete : num 4 3 0 3 4 4 4 4 5 5 ...
## $ grade_all : num 4 11 8 11 8 8 8 8 13 13 ...
## $ unemployed : int 2 1 1 1 1 1 1 1 1 1 ...
## $ reason_OLF : int NA NA NA 3 3 3 9 9 3 3 ...
## $ sector : int NA NA NA NA NA NA 1 1 NA NA ...
## $ occupation : int NA NA NA NA NA NA 5 5 NA NA ...
## $ earn_mont : num 0 0 0 0 0 0 20 20 0 0 ...
## $ earn_ann : num 0 0 0 0 0 0 240 240 0 0 ...
## $ hours_week : num NA NA NA NA NA NA 30 35 NA NA ...
## $ hours_mnt : num NA NA NA NA NA ...
## $ fulltime : int NA NA NA NA NA NA 1 1 NA NA ...
## $ hhexp : num 100 100 100 343 343 ...
## $ legacy_pension_amt: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr ""
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 100 100 108 108 108 100 108 108 108 100 ...
## - attr(*, "val.labels")= chr "" "" "location" "region" ...
## - attr(*, "var.labels")= chr "hhid" "hhweight" "location" "region" ...
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr "_dta" "_svy_su1" "cluster"
## ..$ : chr "_dta" "_svy_strata1" "strata"
## ..$ : chr "_dta" "_svy_stages" "1"
## ..$ : chr "_dta" "_svy_version" "2"
## ..$ : chr "_dta" "__XijVarLabcons" "(sum) cons"
## ..$ : chr "_dta" "ReS_Xij" "cons"
## ..$ : chr "_dta" "ReS_str" "0"
## ..$ : chr "_dta" "ReS_j" "group"
## ..$ : chr "_dta" "ReS_ver" "v.2"
## ..$ : chr "_dta" "ReS_i" "hhid dur"
## ..$ : chr "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc, gall, health, rent, durables we"| __truncated__
## ..$ : chr "_dta" "note0" "1"
## - attr(*, "version")= int 7
## - attr(*, "label.table")=List of 12
## ..$ location: Named int 1 2
## .. ..- attr(*, "names")= chr "urban location" "rural location"
## ..$ region : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "Sofia city" "Bourgass" "Varna" "Lovetch" ...
## ..$ ethnic : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "Bulgaria" "Turks" "Roma" "Other"
## ..$ s2_q2 : Named int 1 2
## .. ..- attr(*, "names")= chr "male" "female"
## ..$ s2_q3 : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "head " "spouse/partner " "child " "son/daughter-in-law " ...
## ..$ lit : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
## ..$ : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "never attanded" "primary" "secondary" "postsecondary"
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "Not unemployed" "Unemployed"
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "student" "housewife/childcare" "in retirement" "illness, disability" ...
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "agriculture" "mining" "manufacturing" "utilities" ...
## ..$ : Named int 1 2 3 4 5
## .. ..- attr(*, "names")= chr "private company" "public works program" "government,public sector, army" "private individual" ...
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
# Create and print structure of edu_equal_2
# convert.factors = FALSE keeps the labelled columns (location, region, ...)
# as their integer codes instead of converting them to factors.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
edu_equal_2 <- read.dta(path, convert.factors = FALSE)
str(edu_equal_2)
## 'data.frame': 12214 obs. of 27 variables:
## $ hhid : num 1 1 1 2 2 3 4 4 5 6 ...
## $ hhweight : num 627 627 627 627 627 ...
## $ location : int 1 1 1 1 1 2 2 2 1 1 ...
## $ region : int 8 8 8 9 9 4 4 4 8 8 ...
## $ ethnicity_head : int 2 2 2 1 1 1 1 1 1 1 ...
## $ age : num 37 11 8 73 70 75 79 80 82 83 ...
## $ gender : int 2 2 1 1 2 1 1 2 2 2 ...
## $ relation : int 1 3 3 1 2 1 1 2 1 1 ...
## $ literate : int 1 2 2 2 2 2 2 2 2 2 ...
## $ income_mnt : num 13.3 13.3 13.3 142.5 142.5 ...
## $ income : num 160 160 160 1710 1710 ...
## $ aggregate : num 1042 1042 1042 3271 3271 ...
## $ aggr_ind_annual : num 347 347 347 1635 1635 ...
## $ educ_completed : int 2 4 4 4 3 3 3 3 4 4 ...
## $ grade_complete : num 4 3 0 3 4 4 4 4 5 5 ...
## $ grade_all : num 4 11 8 11 8 8 8 8 13 13 ...
## $ unemployed : int 2 1 1 1 1 1 1 1 1 1 ...
## $ reason_OLF : int NA NA NA 3 3 3 9 9 3 3 ...
## $ sector : int NA NA NA NA NA NA 1 1 NA NA ...
## $ occupation : int NA NA NA NA NA NA 5 5 NA NA ...
## $ earn_mont : num 0 0 0 0 0 0 20 20 0 0 ...
## $ earn_ann : num 0 0 0 0 0 0 240 240 0 0 ...
## $ hours_week : num NA NA NA NA NA NA 30 35 NA NA ...
## $ hours_mnt : num NA NA NA NA NA ...
## $ fulltime : int NA NA NA NA NA NA 1 1 NA NA ...
## $ hhexp : num 100 100 100 343 343 ...
## $ legacy_pension_amt: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr ""
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 100 100 108 108 108 100 108 108 108 100 ...
## - attr(*, "val.labels")= chr "" "" "location" "region" ...
## - attr(*, "var.labels")= chr "hhid" "hhweight" "location" "region" ...
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr "_dta" "_svy_su1" "cluster"
## ..$ : chr "_dta" "_svy_strata1" "strata"
## ..$ : chr "_dta" "_svy_stages" "1"
## ..$ : chr "_dta" "_svy_version" "2"
## ..$ : chr "_dta" "__XijVarLabcons" "(sum) cons"
## ..$ : chr "_dta" "ReS_Xij" "cons"
## ..$ : chr "_dta" "ReS_str" "0"
## ..$ : chr "_dta" "ReS_j" "group"
## ..$ : chr "_dta" "ReS_ver" "v.2"
## ..$ : chr "_dta" "ReS_i" "hhid dur"
## ..$ : chr "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc, gall, health, rent, durables we"| __truncated__
## ..$ : chr "_dta" "note0" "1"
## - attr(*, "version")= int 7
## - attr(*, "label.table")=List of 12
## ..$ location: Named int 1 2
## .. ..- attr(*, "names")= chr "urban location" "rural location"
## ..$ region : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "Sofia city" "Bourgass" "Varna" "Lovetch" ...
## ..$ ethnic : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "Bulgaria" "Turks" "Roma" "Other"
## ..$ s2_q2 : Named int 1 2
## .. ..- attr(*, "names")= chr "male" "female"
## ..$ s2_q3 : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "head " "spouse/partner " "child " "son/daughter-in-law " ...
## ..$ lit : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
## ..$ : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "never attanded" "primary" "secondary" "postsecondary"
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "Not unemployed" "Unemployed"
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "student" "housewife/childcare" "in retirement" "illness, disability" ...
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "agriculture" "mining" "manufacturing" "utilities" ...
## ..$ : Named int 1 2 3 4 5
## .. ..- attr(*, "names")= chr "private company" "public works program" "government,public sector, army" "private individual" ...
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
# Create and print structure of edu_equal_3
# convert.underscore = TRUE replaces "_" with "." in the variable names
# (e.g. ethnicity_head becomes ethnicity.head).
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
edu_equal_3 <- read.dta(path, convert.underscore = TRUE)
str(edu_equal_3)
## 'data.frame': 12214 obs. of 27 variables:
## $ hhid : num 1 1 1 2 2 3 4 4 5 6 ...
## $ hhweight : num 627 627 627 627 627 ...
## $ location : Factor w/ 2 levels "urban location",..: 1 1 1 1 1 2 2 2 1 1 ...
## $ region : Factor w/ 9 levels "Sofia city","Bourgass",..: 8 8 8 9 9 4 4 4 8 8 ...
## $ ethnicity.head : Factor w/ 4 levels "Bulgaria","Turks",..: 2 2 2 1 1 1 1 1 1 1 ...
## $ age : num 37 11 8 73 70 75 79 80 82 83 ...
## $ gender : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 1 2 2 2 ...
## $ relation : Factor w/ 9 levels "head ",..: 1 3 3 1 2 1 1 2 1 1 ...
## $ literate : Factor w/ 2 levels "no","yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ income.mnt : num 13.3 13.3 13.3 142.5 142.5 ...
## $ income : num 160 160 160 1710 1710 ...
## $ aggregate : num 1042 1042 1042 3271 3271 ...
## $ aggr.ind.annual : num 347 347 347 1635 1635 ...
## $ educ.completed : int 2 4 4 4 3 3 3 3 4 4 ...
## $ grade.complete : num 4 3 0 3 4 4 4 4 5 5 ...
## $ grade.all : num 4 11 8 11 8 8 8 8 13 13 ...
## $ unemployed : int 2 1 1 1 1 1 1 1 1 1 ...
## $ reason.OLF : int NA NA NA 3 3 3 9 9 3 3 ...
## $ sector : int NA NA NA NA NA NA 1 1 NA NA ...
## $ occupation : int NA NA NA NA NA NA 5 5 NA NA ...
## $ earn.mont : num 0 0 0 0 0 0 20 20 0 0 ...
## $ earn.ann : num 0 0 0 0 0 0 240 240 0 0 ...
## $ hours.week : num NA NA NA NA NA NA 30 35 NA NA ...
## $ hours.mnt : num NA NA NA NA NA ...
## $ fulltime : int NA NA NA NA NA NA 1 1 NA NA ...
## $ hhexp : num 100 100 100 343 343 ...
## $ legacy.pension.amt: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr ""
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 100 100 108 108 108 100 108 108 108 100 ...
## - attr(*, "val.labels")= chr "" "" "location" "region" ...
## - attr(*, "var.labels")= chr "hhid" "hhweight" "location" "region" ...
## - attr(*, "expansion.fields")=List of 12
## ..$ : chr "_dta" "_svy_su1" "cluster"
## ..$ : chr "_dta" "_svy_strata1" "strata"
## ..$ : chr "_dta" "_svy_stages" "1"
## ..$ : chr "_dta" "_svy_version" "2"
## ..$ : chr "_dta" "__XijVarLabcons" "(sum) cons"
## ..$ : chr "_dta" "ReS_Xij" "cons"
## ..$ : chr "_dta" "ReS_str" "0"
## ..$ : chr "_dta" "ReS_j" "group"
## ..$ : chr "_dta" "ReS_ver" "v.2"
## ..$ : chr "_dta" "ReS_i" "hhid dur"
## ..$ : chr "_dta" "note1" "variables g1pc, g2pc, g3pc, g4pc, g5pc, g7pc, g8pc, g9pc, g10pc, g11pc, g12pc, gall, health, rent, durables we"| __truncated__
## ..$ : chr "_dta" "note0" "1"
## - attr(*, "version")= int 7
## - attr(*, "label.table")=List of 12
## ..$ location: Named int 1 2
## .. ..- attr(*, "names")= chr "urban location" "rural location"
## ..$ region : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "Sofia city" "Bourgass" "Varna" "Lovetch" ...
## ..$ ethnic : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "Bulgaria" "Turks" "Roma" "Other"
## ..$ s2_q2 : Named int 1 2
## .. ..- attr(*, "names")= chr "male" "female"
## ..$ s2_q3 : Named int 1 2 3 4 5 6 7 8 9
## .. ..- attr(*, "names")= chr "head " "spouse/partner " "child " "son/daughter-in-law " ...
## ..$ lit : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
## ..$ : Named int 1 2 3 4
## .. ..- attr(*, "names")= chr "never attanded" "primary" "secondary" "postsecondary"
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "Not unemployed" "Unemployed"
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "student" "housewife/childcare" "in retirement" "illness, disability" ...
## ..$ : Named int 1 2 3 4 5 6 7 8 9 10
## .. ..- attr(*, "names")= chr "agriculture" "mining" "manufacturing" "utilities" ...
## ..$ : Named int 1 2 3 4 5
## .. ..- attr(*, "names")= chr "private company" "public works program" "government,public sector, army" "private individual" ...
## ..$ : Named int 1 2
## .. ..- attr(*, "names")= chr "no" "yes"
## How many observations/individuals from Bulgaria have an income above 1000?
# Count the rows where both conditions hold; rows where the condition is NA
# are not counted
with(
  edu_equal_1,
  sum(ethnicity_head == "Bulgaria" & income > 1000, na.rm = TRUE)
)
## [1] 8997
# Import international.sav as a data frame: demo
# to.data.frame = TRUE makes read.spss() return a data frame.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
demo <- read.spss(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/international.sav",
  to.data.frame = TRUE
)
# Create boxplot of gdp variable of demo
boxplot(demo$gdp)
## What is the correlation coefficient for the two numerical variables gdp and f_illit (female illiteracy rate)?
cor(demo$gdp, demo$f_illit)
## [1] -0.4476856
# URL of the SPSS file international.sav: path
path <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/international.sav"
# Import international.sav as demo_1
# (TRUE instead of T: T can be reassigned, TRUE cannot)
demo_1 <- read.spss(path, to.data.frame = TRUE)
# Print out the head of demo_1
head(demo_1)
## id country contint m_illit f_illit lifeexpt gdp
## 1 1 Argentina Americas 3.0 3.0 16 3375
## 2 2 Benin Africa 45.2 74.5 7 521
## 3 3 Burundi Africa 33.2 48.1 5 86
## 4 4 Chile Americas 4.2 4.4 14 4523
## 5 5 Dominican Republic Americas 12.0 12.7 12 2408
## 6 6 El Salvador Americas 17.6 22.9 11 2302
# Import international.sav as demo_2
# use.value.labels = FALSE keeps the numeric codes of labelled variables
# (contint stays 1-4 instead of becoming continent names).
# TRUE/FALSE are spelled out because T and F can be reassigned.
demo_2 <- read.spss(path, to.data.frame = TRUE, use.value.labels = FALSE)
# Print out the head of demo_2
head(demo_2)
## id country contint m_illit f_illit lifeexpt gdp
## 1 1 Argentina 2 3.0 3.0 16 3375
## 2 2 Benin 1 45.2 74.5 7 521
## 3 3 Burundi 1 33.2 48.1 5 86
## 4 4 Chile 2 4.2 4.4 14 4523
## 5 5 Dominican Republic 2 12.0 12.7 12 2408
## 6 6 El Salvador 2 17.6 22.9 11 2302
# Display the structure of demo_2
str(demo_2)
## 'data.frame': 20 obs. of 7 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ country : Factor w/ 20 levels "Argentina ",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ contint : atomic 2 1 1 2 2 2 1 4 3 3 ...
## ..- attr(*, "value.labels")= Named chr "4" "3" "2" "1"
## .. ..- attr(*, "names")= chr "Europe" "Asia" "Americas" "Africa"
## $ m_illit : num 3 45.2 33.2 4.2 12 17.6 37.1 0.6 16.5 23 ...
## $ f_illit : num 3 74.5 48.1 4.4 12.7 22.9 54.3 0.7 29.6 39.1 ...
## $ lifeexpt: num 16 7 5 14 12 11 7 15 11 9 ...
## $ gdp : num 3375 521 86 4523 2408 ...
## - attr(*, "variable.labels")= Named chr " " "Country" "Continent" "Male Illiteracy Rate (%)" ...
## ..- attr(*, "names")= chr "id" "country" "contint" "m_illit" ...
## - attr(*, "codepage")= int 1252