This tutorial shows:

  1. Import .sav SPSS files to R

  2. Read variable attributes from the SPSS “variable view” table

  3. Organize the variable coding to data frames

  4. Fix reverse coding in survey data

  5. Best-subset regression

Import SPSS files (.sav) to R

GunControl.Rmd

Sample data: GunControl is a survey data set with about 1500 observations and 58 columns

library(haven) # read_sav
library(tidyverse)
library(dplyr)
library(psych)
library(knitr)
library(DT)
GunControl <- read_sav("GunControl.sav")
#glimpse(GunControl)
GunControl <- GunControl[GunControl$Finished==1,] # only keep the valid responses

Main Survey Questions

Use lapply to release SPSS variable attributes

GunControl_labels <- c() # a text vector
for (i in 1: ncol(GunControl)) {
  GunControl_labels[i] <- attr(GunControl[[i]],"label")
} # double brackets to query variable labels
`%!in%` <- Negate(`%in%`)
DV_col1 <- which(grepl("GUN_", names(GunControl))) # GUN law related columns (step 1)
DV_col1 <- DV_col1[!DV_col1 %in% which(grepl("_REC", names(GunControl)))] # removed recoded binary variables (step 2)
GunControl_labels[DV_col1] # The main questions
## [1] "Stricter gun laws in general"                                                                                             
## [2] "Requiring background checks for all gun buyers"                                                                           
## [3] "A nationwide ban on the sale of assault weapons"                                                                          
## [4] "A nationwide ban on the sale of guns to people who have been convicted of violent crimes"                                 
## [5] "Stricter regulations on ammunition sales"                                                                                 
## [6] "Easier access to firearms"                                                                                                
## [7] "A ban on gun modifications that can make a semi-automatic gun work more like an automatic gun"                            
## [8] "Do you think it is too easy to buy a gun in the U.S. today, too difficult to buy a gun in the U.S. today, or about right?"

Variable Codebook

Release the “variable view” table in SPSS.

include_graphics("SPSS_variableview.png")

attributes -> unlist -> enframe

attributes(GunControl$GUN_DIFF) # query the attributes of an individual variable
## $label
## [1] "Do you think it is too easy to buy a gun in the U.S. today, too difficult to buy a gun in the U.S. today, or about right?"
## 
## $format.spss
## [1] "F40.0"
## 
## $display_width
## [1] 5
## 
## $labels
##      Too easy   About right Too difficult 
##             1             2             3 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"
GUN_attr <- lapply(X = GunControl, FUN = attributes) # group-processed column attributes
GUN_codings <- vector(mode = "list", length(GUN_attr)) # empty list
for (i in 1: length(GUN_attr)) {
  GUN_codings[i] <- GUN_attr[[i]][length(GUN_attr[[i]])-1]
  } # observe the structure of the variable attribute, and then locate which element is the code
GUN_codings_unlisted <- lapply(X = GUN_codings, FUN = unlist) # interim list
GUN_codings_enframe <- lapply(X = GUN_codings_unlisted, FUN = enframe) # final product
names(GUN_codings_enframe) <- names(GunControl) # match variable names
GUN_codings_enframe$Finished # show the coding of one of the variables
## # A tibble: 5 × 2
##   name                       value
##   <chr>                      <dbl>
## 1 Didn't finish                  0
## 2 Valid response                 1
## 3 Didn't read questions          2
## 4 Bot                            3
## 5 Not eligible- age or state     9
names(GUN_codings_enframe) <- seq(1:length(GUN_codings_enframe))
GUN_codings_rbind <- do.call(what = rbind, args = GUN_codings_enframe) # combine multiple variable coding dataframes to one dataframe
GUN_codings_rbind$key <- floor(as.numeric(row.names(GUN_codings_rbind)))
GunControl_labels <- data.frame(GunControl_labels)
GunControl_labels$key <- seq(1: nrow(GunControl_labels))
GunControl_labels$variable <- names(GunControl)
GUN_codebook <- merge(GunControl_labels, GUN_codings_rbind, by.x = "key", by.y = "key", all.x = T, all.y = T)
names(GUN_codebook) <- c("column#", "question", "variable", "response", "code")
GUN_codebook$response <- ifelse(nchar(GUN_codebook$response)<2, "", GUN_codebook$response) # some variables do not having meaningful codings, so remove the irrelevant non-coding information
GUN_codebook$code <- ifelse(nchar(GUN_codebook$response)<2, "", GUN_codebook$code) # some variables do not having meaningful codings, so remove the irrelevant non-coding information
## revised the code book data frame by aggregating the responses by each question
GUN_codebook_short <- GUN_codebook
GUN_codebook_short$responses <- ifelse(nchar(GUN_codebook_short$response)>1, paste(GUN_codebook_short$response, GUN_codebook_short$code, sep = " = "),"")
GUN_codebook_short <- aggregate(responses ~ `column#`+ variable + question, data = GUN_codebook_short, FUN = combine_words) # use combine_words over paste for ", "
GUN_codebook_short <- GUN_codebook_short[order(GUN_codebook_short$`column#`),]

Codebook V1

Each distinct response is in a separate row:

datatable(GUN_codebook[GUN_codebook$variable=="EDU",], rownames = F) # each distinct response is in a separate row

Codebook V2

Combine all responses for the same question into one row:

datatable(GUN_codebook_short[GUN_codebook_short$`column#` %in% DV_col1,], rownames = F) # combine all responses for the same question into one row

Group-Process Columns

Be aware that dataframe columns imported from SPSS files may not be ready for analyses. It is important to group-process the relevant columns using sapply and as.numeric

GunControl <- as.data.frame(GunControl) # "tbl_df"     "tbl"        "data.frame" to "data.frame"
sapply(GunControl[,DV_col1],class) # check DV classes
##      GUN_GUN_LAWS     GUN_GUN_BGCHK    GUN_GUN_ASSAULTBAN GUN_GUN_VIOLBAN 
## [1,] "haven_labelled" "haven_labelled" "haven_labelled"   "haven_labelled"
## [2,] "vctrs_vctr"     "vctrs_vctr"     "vctrs_vctr"       "vctrs_vctr"    
## [3,] "double"         "double"         "double"           "double"        
##      GUN_GUN_AMMOREG  GUN_GUN_EASYACCESS GUN_GUN_MOD      GUN_DIFF        
## [1,] "haven_labelled" "haven_labelled"   "haven_labelled" "haven_labelled"
## [2,] "vctrs_vctr"     "vctrs_vctr"       "vctrs_vctr"     "vctrs_vctr"    
## [3,] "double"         "double"           "double"         "double"
GunControl[,DV_col1] <- sapply(GunControl[,DV_col1],as.numeric) # change class for multiple variables, be cautious, it'll remove attributes from SPSS
DV_GUN <- GunControl[,DV_col1]
DV_GUN <- DV_GUN[complete.cases(DV_GUN),]
summary(DV_GUN) # The mean and median of two variables are much different from the others, potential reverse coding
##   GUN_GUN_LAWS   GUN_GUN_BGCHK   GUN_GUN_ASSAULTBAN GUN_GUN_VIOLBAN
##  Min.   :1.000   Min.   :1.000   Min.   :1.00       Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:4.000   1st Qu.:2.00       1st Qu.:4.000  
##  Median :4.000   Median :4.000   Median :4.00       Median :4.000  
##  Mean   :3.109   Mean   :3.782   Mean   :3.13       Mean   :3.731  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.00       3rd Qu.:4.000  
##  Max.   :4.000   Max.   :4.000   Max.   :4.00       Max.   :4.000  
##  GUN_GUN_AMMOREG GUN_GUN_EASYACCESS  GUN_GUN_MOD       GUN_DIFF    
##  Min.   :1.00    Min.   :1.000      Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.00    1st Qu.:1.000      1st Qu.:3.000   1st Qu.:1.000  
##  Median :3.00    Median :1.000      Median :4.000   Median :1.000  
##  Mean   :3.09    Mean   :1.732      Mean   :3.334   Mean   :1.375  
##  3rd Qu.:4.00    3rd Qu.:2.000      3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :4.00    Max.   :4.000      Max.   :4.000   Max.   :3.000

Identify Reverse Codings

GUN_codebook_short$question[GUN_codebook_short$`column#` %in% DV_col1]
## [1] "Stricter gun laws in general"                                                                                             
## [2] "Requiring background checks for all gun buyers"                                                                           
## [3] "A nationwide ban on the sale of assault weapons"                                                                          
## [4] "A nationwide ban on the sale of guns to people who have been convicted of violent crimes"                                 
## [5] "Stricter regulations on ammunition sales"                                                                                 
## [6] "Easier access to firearms"                                                                                                
## [7] "A ban on gun modifications that can make a semi-automatic gun work more like an automatic gun"                            
## [8] "Do you think it is too easy to buy a gun in the U.S. today, too difficult to buy a gun in the U.S. today, or about right?"

Correlation Table

##             LAWS BGCHK ASSAULTBAN VIOLBAN AMMOREG EASYACCESS   MOD  DIFF
## LAWS        1.00  0.36       0.63    0.27    0.80      -0.51  0.50 -0.62
## BGCHK       0.36  1.00       0.37    0.45    0.37      -0.38  0.37 -0.41
## ASSAULTBAN  0.63  0.37       1.00    0.30    0.69      -0.48  0.59 -0.52
## VIOLBAN     0.27  0.45       0.30    1.00    0.28      -0.30  0.31 -0.26
## AMMOREG     0.80  0.37       0.69    0.28    1.00      -0.50  0.51 -0.59
## EASYACCESS -0.51 -0.38      -0.48   -0.30   -0.50       1.00 -0.39  0.53
## MOD         0.50  0.37       0.59    0.31    0.51      -0.39  1.00 -0.41
## DIFF       -0.62 -0.41      -0.52   -0.26   -0.59       0.53 -0.41  1.00
library(corrplot)
## corrplot 0.92 loaded
corrplot(round(cor(DV_GUN),3), type = "upper", tl.col = "black", tl.srt = 45, tl.cex = .8)

GunControl$GENDER <- ifelse(GunControl$GENDER==1,"Male","Female")
GunControl$URBAN <- ifelse(GunControl$URBAN==1, "Urban", "Rural")
GunControl$EDU <- as.numeric(GunControl$EDU)

BEST SUBSET

#names(GunControl)
library(leaps) # best subset regression models
GUN.regsubset <- regsubsets(GUN_GUN_LAWS ~ AGE + EDU + INCOME + POL_VIEW+ GENDER+ URBAN, nbest = 3, data = GunControl)
cbind(summary(GUN.regsubset)$outmat,round(summary(GUN.regsubset)$adjr2,4), round(summary(GUN.regsubset)$cp,2))
##          AGE EDU INCOME POL_VIEW GENDERMale URBANUrban                  
## 1  ( 1 ) " " " " " "    "*"      " "        " "        "0.1155" "45.23" 
## 1  ( 2 ) " " " " " "    " "      " "        "*"        "0.0257" "158.43"
## 1  ( 3 ) " " " " " "    " "      "*"        " "        "0.023"  "161.76"
## 2  ( 1 ) " " " " " "    "*"      "*"        " "        "0.1332" "23.84" 
## 2  ( 2 ) " " " " " "    "*"      " "        "*"        "0.1319" "25.52" 
## 2  ( 3 ) " " "*" " "    "*"      " "        " "        "0.1152" "46.55" 
## 3  ( 1 ) " " " " " "    "*"      "*"        "*"        "0.1515" "1.74"  
## 3  ( 2 ) " " "*" " "    "*"      "*"        " "        "0.1333" "24.72" 
## 3  ( 3 ) "*" " " " "    "*"      "*"        " "        "0.1331" "25"    
## 4  ( 1 ) " " "*" " "    "*"      "*"        "*"        "0.1511" "3.3"   
## 4  ( 2 ) "*" " " " "    "*"      "*"        "*"        "0.151"  "3.43"  
## 4  ( 3 ) " " " " "*"    "*"      "*"        "*"        "0.1508" "3.62"  
## 5  ( 1 ) "*" "*" " "    "*"      "*"        "*"        "0.1505" "5.01"  
## 5  ( 2 ) " " "*" "*"    "*"      "*"        "*"        "0.1503" "5.28"  
## 5  ( 3 ) "*" " " "*"    "*"      "*"        "*"        "0.1503" "5.33"  
## 6  ( 1 ) "*" "*" "*"    "*"      "*"        "*"        "0.1497" "7"
summary(lm(GUN_GUN_LAWS ~ GENDER+ POL_VIEW + URBAN, data = GunControl))
## 
## Call:
## lm(formula = GUN_GUN_LAWS ~ GENDER + POL_VIEW + URBAN, data = GunControl)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6897 -0.7541  0.2459  0.6204  1.8647 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.13462    0.09280  23.004  < 2e-16 ***
## GENDERMale  -0.30872    0.05841  -5.285  1.5e-07 ***
## POL_VIEW     0.30940    0.02642  11.710  < 2e-16 ***
## URBANUrban   0.31680    0.06307   5.023  5.9e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.984 on 1137 degrees of freedom
##   (251 observations deleted due to missingness)
## Multiple R-squared:  0.1548, Adjusted R-squared:  0.1525 
## F-statistic:  69.4 on 3 and 1137 DF,  p-value: < 2.2e-16