This tutorial shows:
Import .sav SPSS files to R
Read variable attributes from the SPSS “variable view” table
Organize the variable coding to data frames
Fix reverse coding in survey data
Best-subset regression
Import SPSS files (.sav) to R
GunControl.Rmd
Sample data: GunControl
is a survey data set with about 1500 observations and 58 columns
library(haven) # read_sav
library(tidyverse)
library(dplyr)
library(psych)
library(knitr)
library(DT)
GunControl <- read_sav("GunControl.sav")
#glimpse(GunControl)
GunControl <- GunControl[GunControl$Finished==1,] # only keep the valid responses
Main Survey Questions
Use lapply
to release SPSS variable attributes
GunControl_labels <- c() # a text vector
for (i in 1: ncol(GunControl)) {
GunControl_labels[i] <- attr(GunControl[[i]],"label")
} # double brackets to query variable labels
`%!in%` <- Negate(`%in%`)
DV_col1 <- which(grepl("GUN_", names(GunControl))) # GUN law related columns (step 1)
DV_col1 <- DV_col1[!DV_col1 %in% which(grepl("_REC", names(GunControl)))] # removed recoded binary variables (step 2)
GunControl_labels[DV_col1] # The main questions
## [1] "Stricter gun laws in general"
## [2] "Requiring background checks for all gun buyers"
## [3] "A nationwide ban on the sale of assault weapons"
## [4] "A nationwide ban on the sale of guns to people who have been convicted of violent crimes"
## [5] "Stricter regulations on ammunition sales"
## [6] "Easier access to firearms"
## [7] "A ban on gun modifications that can make a semi-automatic gun work more like an automatic gun"
## [8] "Do you think it is too easy to buy a gun in the U.S. today, too difficult to buy a gun in the U.S. today, or about right?"
Variable Codebook
Release the “variable view” table in SPSS.
include_graphics("SPSS_variableview.png")

attributes
-> unlist
-> enframe
attributes(GunControl$GUN_DIFF) # query the attributes of an individual variable
## $label
## [1] "Do you think it is too easy to buy a gun in the U.S. today, too difficult to buy a gun in the U.S. today, or about right?"
##
## $format.spss
## [1] "F40.0"
##
## $display_width
## [1] 5
##
## $labels
## Too easy About right Too difficult
## 1 2 3
##
## $class
## [1] "haven_labelled" "vctrs_vctr" "double"
GUN_attr <- lapply(X = GunControl, FUN = attributes) # group-processed column attributes
GUN_codings <- vector(mode = "list", length(GUN_attr)) # empty list
for (i in 1: length(GUN_attr)) {
GUN_codings[i] <- GUN_attr[[i]][length(GUN_attr[[i]])-1]
} # observe the structure of the variable attribute, and then locate which element is the code
GUN_codings_unlisted <- lapply(X = GUN_codings, FUN = unlist) # interim list
GUN_codings_enframe <- lapply(X = GUN_codings_unlisted, FUN = enframe) # final product
names(GUN_codings_enframe) <- names(GunControl) # match variable names
GUN_codings_enframe$Finished # show the coding of one of the variables
## # A tibble: 5 × 2
## name value
## <chr> <dbl>
## 1 Didn't finish 0
## 2 Valid response 1
## 3 Didn't read questions 2
## 4 Bot 3
## 5 Not eligible- age or state 9
names(GUN_codings_enframe) <- seq(1:length(GUN_codings_enframe))
GUN_codings_rbind <- do.call(what = rbind, args = GUN_codings_enframe) # combine multiple variable coding dataframes to one dataframe
GUN_codings_rbind$key <- floor(as.numeric(row.names(GUN_codings_rbind)))
GunControl_labels <- data.frame(GunControl_labels)
GunControl_labels$key <- seq(1: nrow(GunControl_labels))
GunControl_labels$variable <- names(GunControl)
GUN_codebook <- merge(GunControl_labels, GUN_codings_rbind, by.x = "key", by.y = "key", all.x = T, all.y = T)
names(GUN_codebook) <- c("column#", "question", "variable", "response", "code")
GUN_codebook$response <- ifelse(nchar(GUN_codebook$response)<2, "", GUN_codebook$response) # some variables do not having meaningful codings, so remove the irrelevant non-coding information
GUN_codebook$code <- ifelse(nchar(GUN_codebook$response)<2, "", GUN_codebook$code) # some variables do not having meaningful codings, so remove the irrelevant non-coding information
## revised the code book data frame by aggregating the responses by each question
GUN_codebook_short <- GUN_codebook
GUN_codebook_short$responses <- ifelse(nchar(GUN_codebook_short$response)>1, paste(GUN_codebook_short$response, GUN_codebook_short$code, sep = " = "),"")
GUN_codebook_short <- aggregate(responses ~ `column#`+ variable + question, data = GUN_codebook_short, FUN = combine_words) # use combine_words over paste for ", "
GUN_codebook_short <- GUN_codebook_short[order(GUN_codebook_short$`column#`),]
Codebook V1
Each distinct response is in a separate row:
datatable(GUN_codebook[GUN_codebook$variable=="EDU",], rownames = F) # each distinct response is in a separate row
Codebook V2
Combine all responses for the same question into one row:
datatable(GUN_codebook_short[GUN_codebook_short$`column#` %in% DV_col1,], rownames = F) # combine all responses for the same question into one row
Group-Process Columns
Be aware that dataframe columns imported from SPSS files may not be ready for analyses. It is important to group-process the relevant columns using sapply
and as.numeric
GunControl <- as.data.frame(GunControl) # "tbl_df" "tbl" "data.frame" to "data.frame"
sapply(GunControl[,DV_col1],class) # check DV classes
## GUN_GUN_LAWS GUN_GUN_BGCHK GUN_GUN_ASSAULTBAN GUN_GUN_VIOLBAN
## [1,] "haven_labelled" "haven_labelled" "haven_labelled" "haven_labelled"
## [2,] "vctrs_vctr" "vctrs_vctr" "vctrs_vctr" "vctrs_vctr"
## [3,] "double" "double" "double" "double"
## GUN_GUN_AMMOREG GUN_GUN_EASYACCESS GUN_GUN_MOD GUN_DIFF
## [1,] "haven_labelled" "haven_labelled" "haven_labelled" "haven_labelled"
## [2,] "vctrs_vctr" "vctrs_vctr" "vctrs_vctr" "vctrs_vctr"
## [3,] "double" "double" "double" "double"
GunControl[,DV_col1] <- sapply(GunControl[,DV_col1],as.numeric) # change class for multiple variables, be cautious, it'll remove attributes from SPSS
DV_GUN <- GunControl[,DV_col1]
DV_GUN <- DV_GUN[complete.cases(DV_GUN),]
summary(DV_GUN) # The mean and median of two variables are much different from the others, potential reverse coding
## GUN_GUN_LAWS GUN_GUN_BGCHK GUN_GUN_ASSAULTBAN GUN_GUN_VIOLBAN
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.:4.000 1st Qu.:2.00 1st Qu.:4.000
## Median :4.000 Median :4.000 Median :4.00 Median :4.000
## Mean :3.109 Mean :3.782 Mean :3.13 Mean :3.731
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:4.000
## Max. :4.000 Max. :4.000 Max. :4.00 Max. :4.000
## GUN_GUN_AMMOREG GUN_GUN_EASYACCESS GUN_GUN_MOD GUN_DIFF
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.00 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000
## Median :3.00 Median :1.000 Median :4.000 Median :1.000
## Mean :3.09 Mean :1.732 Mean :3.334 Mean :1.375
## 3rd Qu.:4.00 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :4.00 Max. :4.000 Max. :4.000 Max. :3.000
Identify Reverse Codings
GUN_codebook_short$question[GUN_codebook_short$`column#` %in% DV_col1]
## [1] "Stricter gun laws in general"
## [2] "Requiring background checks for all gun buyers"
## [3] "A nationwide ban on the sale of assault weapons"
## [4] "A nationwide ban on the sale of guns to people who have been convicted of violent crimes"
## [5] "Stricter regulations on ammunition sales"
## [6] "Easier access to firearms"
## [7] "A ban on gun modifications that can make a semi-automatic gun work more like an automatic gun"
## [8] "Do you think it is too easy to buy a gun in the U.S. today, too difficult to buy a gun in the U.S. today, or about right?"
Correlation Table
## LAWS BGCHK ASSAULTBAN VIOLBAN AMMOREG EASYACCESS MOD DIFF
## LAWS 1.00 0.36 0.63 0.27 0.80 -0.51 0.50 -0.62
## BGCHK 0.36 1.00 0.37 0.45 0.37 -0.38 0.37 -0.41
## ASSAULTBAN 0.63 0.37 1.00 0.30 0.69 -0.48 0.59 -0.52
## VIOLBAN 0.27 0.45 0.30 1.00 0.28 -0.30 0.31 -0.26
## AMMOREG 0.80 0.37 0.69 0.28 1.00 -0.50 0.51 -0.59
## EASYACCESS -0.51 -0.38 -0.48 -0.30 -0.50 1.00 -0.39 0.53
## MOD 0.50 0.37 0.59 0.31 0.51 -0.39 1.00 -0.41
## DIFF -0.62 -0.41 -0.52 -0.26 -0.59 0.53 -0.41 1.00
library(corrplot)
## corrplot 0.92 loaded
corrplot(round(cor(DV_GUN),3), type = "upper", tl.col = "black", tl.srt = 45, tl.cex = .8)

GunControl$GENDER <- ifelse(GunControl$GENDER==1,"Male","Female")
GunControl$URBAN <- ifelse(GunControl$URBAN==1, "Urban", "Rural")
GunControl$EDU <- as.numeric(GunControl$EDU)
BEST SUBSET
#names(GunControl)
library(leaps) # best subset regression models
GUN.regsubset <- regsubsets(GUN_GUN_LAWS ~ AGE + EDU + INCOME + POL_VIEW+ GENDER+ URBAN, nbest = 3, data = GunControl)
cbind(summary(GUN.regsubset)$outmat,round(summary(GUN.regsubset)$adjr2,4), round(summary(GUN.regsubset)$cp,2))
## AGE EDU INCOME POL_VIEW GENDERMale URBANUrban
## 1 ( 1 ) " " " " " " "*" " " " " "0.1155" "45.23"
## 1 ( 2 ) " " " " " " " " " " "*" "0.0257" "158.43"
## 1 ( 3 ) " " " " " " " " "*" " " "0.023" "161.76"
## 2 ( 1 ) " " " " " " "*" "*" " " "0.1332" "23.84"
## 2 ( 2 ) " " " " " " "*" " " "*" "0.1319" "25.52"
## 2 ( 3 ) " " "*" " " "*" " " " " "0.1152" "46.55"
## 3 ( 1 ) " " " " " " "*" "*" "*" "0.1515" "1.74"
## 3 ( 2 ) " " "*" " " "*" "*" " " "0.1333" "24.72"
## 3 ( 3 ) "*" " " " " "*" "*" " " "0.1331" "25"
## 4 ( 1 ) " " "*" " " "*" "*" "*" "0.1511" "3.3"
## 4 ( 2 ) "*" " " " " "*" "*" "*" "0.151" "3.43"
## 4 ( 3 ) " " " " "*" "*" "*" "*" "0.1508" "3.62"
## 5 ( 1 ) "*" "*" " " "*" "*" "*" "0.1505" "5.01"
## 5 ( 2 ) " " "*" "*" "*" "*" "*" "0.1503" "5.28"
## 5 ( 3 ) "*" " " "*" "*" "*" "*" "0.1503" "5.33"
## 6 ( 1 ) "*" "*" "*" "*" "*" "*" "0.1497" "7"
summary(lm(GUN_GUN_LAWS ~ GENDER+ POL_VIEW + URBAN, data = GunControl))
##
## Call:
## lm(formula = GUN_GUN_LAWS ~ GENDER + POL_VIEW + URBAN, data = GunControl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6897 -0.7541 0.2459 0.6204 1.8647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.13462 0.09280 23.004 < 2e-16 ***
## GENDERMale -0.30872 0.05841 -5.285 1.5e-07 ***
## POL_VIEW 0.30940 0.02642 11.710 < 2e-16 ***
## URBANUrban 0.31680 0.06307 5.023 5.9e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.984 on 1137 degrees of freedom
## (251 observations deleted due to missingness)
## Multiple R-squared: 0.1548, Adjusted R-squared: 0.1525
## F-statistic: 69.4 on 3 and 1137 DF, p-value: < 2.2e-16