library(reshape)
library(haven)
library(foreign)
library(readr)
library(dplyr)
library(ggplot2)
library(broom)
library(car)
library(MASS)
library(lmtest)
library(zoo)
library(nortest)
library(plotrix)
library(scales)
library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)
library(VGAM)
library(mice)
library(stargazer)
library(sandwich)
library(pastecs)
library(muhaz)
library(ggpubr)
library(eha)
library(reshape2)
library(data.table)
library(magrittr)
library(tidyverse)
library(sjmisc)
library(sjmisc)
library(sjlabelled)
library(GGally)
library(tigris)
library(RColorBrewer)
library(patchwork)
library(tidycensus)
library(censusapi)
library(wk)
library(spdep)
library(classInt)
library(sf)
library(spatialreg)
library(haven)
library(lme4)
library(jtools)
library(ggstance)
library(effects)
library(eha)
library(discSurv)
library(brms)
library(sensemakr)
library(survminer)
library(sjPlot)
library(weights)
library(gtsummary)
library(srvyr)
library(devtools)
library(vtable)
library(DBI)
library(RSQLite)
library(sqldf)
In this RMD, I assess socioeconomic and demographic characteristics of the labor force using a Public Use Microdata (PUMS) database provided for instruction. I use SQL syntax and dplyr to acheive this.
Load data
yo<-dbConnect(SQLite(),
dbname = "C://Users//Jaire//OneDrive//Desktop//Exploratory Research//SQL//PUMS_data")
Components of database
dbListTables(yo)
## [1] "pums_hh_vars_csv" "pums_tx_ca" "pums_tx_ca_2019"
## [4] "pums_tx_ca_hh" "pums_tx_ca_hh_2019" "pums_vars_csv"
Create dplyr objects
individual <- tbl(yo, "pums_tx_ca")
household <- tbl(yo, "pums_tx_ca_hh")
Number of rows in table
dbGetQuery(yo,
'SELECT COUNT(*)
FROM pums_tx_ca;')
## COUNT(*)
## 1 625899
Table w/ variables, 10 rows
dbGetQuery(yo,
'SELECT *
FROM pums_tx_ca
LIMIT 10;')
Select demographics
dbGetQuery(yo,
'SELECT AGEP, SEX, HISP, ST
FROM pums_tx_ca
LIMIT 10;')
## AGEP SEX HISP ST
## 1 42 1 1 48
## 2 35 2 1 48
## 3 9 2 1 48
## 4 8 2 1 48
## 5 0 2 1 48
## 6 27 2 2 6
## 7 33 2 2 6
## 8 24 2 2 6
## 9 55 1 1 48
## 10 56 2 2 48
Check for missing values in AGEP
dbGetQuery(yo,
'SELECT count(*)
FROM pums_tx_ca
WHERE AGEP IS NULL;')
## count(*)
## 1 0
Ages shared by multiple residents
dbGetQuery(yo,
'SELECT DISTINCT(AGEP)
FROM pums_tx_ca;')
Number of different ages across residents
dbGetQuery(yo,
'SELECT COUNT ( DISTINCT ( AGEP ) )
FROM pums_tx_ca;')
## COUNT ( DISTINCT ( AGEP ) )
## 1 93
Individuals 15 to 64 years old
dbGetQuery(yo,
'SELECT *
FROM pums_tx_ca
WHERE AGEP >=15 OR AGEP <=64
LIMIT 10;')
Create variables for residents who are not of working age (<15,64<)
Younger than 15
#dbExecute(yo,
#'ALTER TABLE pums_tx_ca
#ADD under15 BOOL;')
dbExecute(yo,
'UPDATE pums_tx_ca SET under15 = 0;')
## [1] 625899
dbExecute(yo,
'UPDATE pums_tx_ca SET under15 = 1 WHERE AGEP <15;')
## [1] 111479
Using dplyr: younger than 15
individual %>%
select(SERIALNO, AGEP)%>%
mutate(under15 = ifelse(AGEP <15, "Under 15", "15+"))%>%
show_query()
## <SQL>
## SELECT
## `SERIALNO`,
## `AGEP`,
## CASE WHEN (`AGEP` < 15.0) THEN 'Under 15' WHEN NOT (`AGEP` < 15.0) THEN '15+' END AS `under15`
## FROM `pums_tx_ca`
65 or older variable
#dbExecute(yo,
#'ALTER TABLE pums_tx_ca
#ADD over64 BOOL;')
dbExecute(yo,
'UPDATE pums_tx_ca SET over64 = 0;')
## [1] 625899
dbExecute(yo,
'UPDATE pums_tx_ca SET over64 = 1 WHERE AGEP > 64;')
## [1] 114246
Using dplyr: 65 or older variable
individual %>%
select(SERIALNO, AGEP)%>%
mutate(over64 = ifelse(AGEP >64, "65+", "64 or younger"))%>%
show_query()
## <SQL>
## SELECT
## `SERIALNO`,
## `AGEP`,
## CASE WHEN (`AGEP` > 64.0) THEN '65+' WHEN NOT (`AGEP` > 64.0) THEN '64 or younger' END AS `over64`
## FROM `pums_tx_ca`
Verify variables and AGEP
dbGetQuery(yo,
'SELECT under15, over64, AGEP
FROM pums_tx_ca
LIMIT 10;')
## under15 over64 AGEP
## 1 0 0 42
## 2 0 0 35
## 3 1 0 9
## 4 1 0 8
## 5 1 0 0
## 6 0 0 27
## 7 0 0 33
## 8 0 0 24
## 9 0 0 55
## 10 0 0 56
Using dplyr: state, wages, and age group
individual %>% select(SERIALNO, ST, WAGP, under15, over64)%>%
filter(WAGP!=0) %>%
head(n =10)
## # Source: SQL [10 x 5]
## # Database: sqlite 3.39.4 [C:\Users\Jaire\OneDrive\Desktop\Exploratory Research\SQL\PUMS_data]
## SERIALNO ST WAGP under15 over64
## <chr> <int> <int> <int> <int>
## 1 2019HU0000003 48 75000 0 0
## 2 2019HU0000003 48 -1 1 0
## 3 2019HU0000003 48 -1 1 0
## 4 2019HU0000003 48 -1 1 0
## 5 2019HU0000016 6 34800 0 0
## 6 2019HU0000023 48 38000 0 0
## 7 2019HU0000026 6 28800 0 0
## 8 2019HU0000026 6 -1 1 0
## 9 2019HU0000026 6 -1 1 0
## 10 2019HU0000026 6 25900 0 0
Check distributions of new variables
dbGetQuery(yo,
'SELECT under15, COUNT(under15)
FROM pums_tx_ca
GROUP BY under15
ORDER BY COUNT(under15);')
## under15 COUNT(under15)
## 1 1 111479
## 2 0 514420
dbGetQuery(yo,
'SELECT over64, COUNT(over64)
FROM pums_tx_ca
GROUP BY over64
ORDER BY COUNT(over64);')
## over64 COUNT(over64)
## 1 1 114246
## 2 0 511653
Number of residents under 15 (1) and over 15 (0) by state (48) TEXAS & (6) CALIFORNIA
dbGetQuery(yo,
'SELECT under15,ST, COUNT(*)
FROM pums_tx_ca
GROUP BY under15, ST
ORDER BY under15 DESC;')
## under15 ST COUNT(*)
## 1 1 6 61754
## 2 1 48 49725
## 3 0 6 302245
## 4 0 48 212175
Using dplyr: state, wage (mean, median), residents under 30
individual %>% select( SERIALNO, ST, WAGP,AGEP)%>%
filter(AGEP >=15, WAGP !=0) %>%
mutate(under30 = ifelse(AGEP <30, 1, 0))%>%
group_by(ST,under30)%>%
summarise(meanwage = mean(WAGP),
medianwage = median(WAGP))%>%
mutate(diff = meanwage - medianwage)
## # Source: SQL [4 x 5]
## # Database: sqlite 3.39.4 [C:\Users\Jaire\OneDrive\Desktop\Exploratory Research\SQL\PUMS_data]
## # Groups: ST
## ST under30 meanwage medianwage diff
## <int> <dbl> <dbl> <int> <dbl>
## 1 6 0 75924. 51000 24924.
## 2 6 1 30628. 20300 10328.
## 3 48 0 64068. 45000 19068.
## 4 48 1 26442. 20000 6442.
Number of individuals that will transfer back into the dependent population within a year by state (48) TEXAS & (6) CALIFORNIA
dbGetQuery(yo,
'SELECT AGEP,ST, COUNT(*)
FROM pums_tx_ca
GROUP BY AGEP, ST
HAVING AGEP = 64
ORDER BY AGEP DESC;')
## AGEP ST COUNT(*)
## 1 64 6 4741
## 2 64 48 3589
Over and under 64 by Hispanic ethnicity, race, count, and average age
dbGetQuery(yo,
'SELECT over64, HISP, RAC1P, COUNT(over64), AVG(AGEP)
FROM pums_tx_ca
WHERE WAGP > 0
GROUP BY over64, SEX_label
ORDER BY over64;')
## over64 HISP RAC1P COUNT(over64) AVG(AGEP)
## 1 0 2 8 135223 40.61819
## 2 0 1 6 148121 40.64538
## 3 1 1 2 10500 69.62495
## 4 1 1 1 12311 69.97173
Left join individual data with household data
dbGetQuery(yo,
'SELECT * FROM pums_tx_ca x
LEFT JOIN pums_tx_ca_hh y
ON x.SERIALNO = y.SERIALNO
LIMIT 100
;')
Using dyplr: right Join with query read out
individual %>%
mutate(over64 = ifelse(AGEP >64, "over 64", "64 and under"))%>%
select(SERIALNO, AGEP, WAGP, SEX, HISP, over65)%>%
right_join(household, by = 'SERIALNO' ) %>%
head()%>%
show_query()
## <SQL>
## SELECT *
## FROM (
## SELECT
## `RHS`.`SERIALNO` AS `SERIALNO`,
## `AGEP`,
## `WAGP`,
## `SEX`,
## `HISP`,
## `over65`,
## `SPORDER`,
## `WGTP`,
## `PWGTP`,
## `NP`,
## `VALP`,
## `ST`,
## `TYPE`,
## `ACCESS`,
## `HISPEED`,
## `TEN`,
## `ST_label`,
## `TYPE_label`,
## `ACCESS_label`,
## `HISPEED_label`,
## `TEN_label`
## FROM `pums_tx_ca_hh` AS `RHS`
## LEFT JOIN (
## SELECT `SERIALNO`, `AGEP`, `WAGP`, `SEX`, `HISP`, `over65`
## FROM `pums_tx_ca`
## ) AS `LHS`
## ON (`RHS`.`SERIALNO` = `LHS`.`SERIALNO`)
## )
## LIMIT 6
Using dyplr: graphic representation of select demographic characteristics
individual %>%
select(SERIALNO, AGEP, ST,HISP, WAGP)%>%
filter(AGEP > 15, WAGP !=0) %>%
mutate(over64 = ifelse(AGEP >64, 1, 0),
Ethnicity = ifelse(HISP ==1 , "Hispanic", "non-Hispanic"))%>%
group_by(ST, over64, Ethnicity,AGEP)%>%
summarise(meanwage = mean(WAGP),
medianwage = median(WAGP),
samp = n()) %>%
ggplot()+
aes(x = AGEP,
y = medianwage,
fill=factor(over64))+
geom_bar(stat="identity")+
facet_wrap(~Ethnicity) +
scale_y_continuous(labels = scales::label_dollar())
## `summarise()` has grouped output by "ST", "over64", and "Ethnicity". You can
## override using the `.groups` argument.
The graphic shows median wage by age among Hispanics and non-Hispanics, and shows the contrast between individuals under 64 years of age (red) and over 64 years of age (blue).
Store query data
savedquery <- individual %>%
select(SERIALNO, AGEP, ST,HISP, WAGP)%>%
filter(AGEP > 15, WAGP !=0) %>%
mutate(over64 = ifelse(AGEP >64, 1, 0),
Ethnicity = ifelse(HISP ==1 , "Hispanic", "non-Hispanic"))%>%
group_by(ST, over64, Ethnicity,AGEP)%>%
summarise(meanwage = mean(WAGP),
medianwage = median(WAGP),
samp = n()) %>%
collect()
## `summarise()` has grouped output by "ST", "over64", and "Ethnicity". You can
## override using the `.groups` argument.
Create csv
write.csv(savedquery, file = "savedquery.csv")
head(savedquery)
## # A tibble: 6 × 7
## # Groups: ST, over64, Ethnicity [1]
## ST over64 Ethnicity AGEP meanwage medianwage samp
## <int> <dbl> <chr> <int> <dbl> <dbl> <int>
## 1 6 0 Hispanic 16 2959. 1400 411
## 2 6 0 Hispanic 17 3804. 2000 700
## 3 6 0 Hispanic 18 6297. 3000 851
## 4 6 0 Hispanic 19 9420. 5600 1112
## 5 6 0 Hispanic 20 11942. 8400 1327
## 6 6 0 Hispanic 21 15064. 10000 1508
dbDisconnect(yo)