Load in your packages

These can also be found on the right side menu under “Packages”. Packages are made by other R users and are groups of functions that make using R easier. There are certain packages we tend to use more often than not so it’s usually good to start with loading these packages. If you don’t have these in your library, you will have to click the “install” button in the packages menu.

library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.2     ✓ purrr   0.3.4
✓ tibble  3.0.3     ✓ dplyr   1.0.0
✓ tidyr   1.1.0     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
package ‘ggplot2’ was built under R version 3.6.2
package ‘tibble’ was built under R version 3.6.2
package ‘tidyr’ was built under R version 3.6.2
package ‘purrr’ was built under R version 3.6.2
package ‘dplyr’ was built under R version 3.6.2
── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(psych)

Attaching package: ‘psych’

The following objects are masked from ‘package:ggplot2’:

    %+%, alpha
library(haven)
package ‘haven’ was built under R version 3.6.2

Load in the Data

There are lots of ways to load data into R. But here is an easy one: go to “Files” on the right side, click the file you would like to use, and select “Import Dataset”. Copy and paste the code preview into your R notebook (or just click the import button). View and glimpse are great ways to get an initial feel for the structure of your data.

glimpse(gss)
Rows: 2,765
Columns: 16
$ id       <dbl> 2331, 2003, 1221, 2051, 2465, 546, 1291, 732, 303, 2700, 855, 6…
$ hrs1     <dbl+lbl> NA, NA, NA, NA, 50, 60, 40, 25, NA, 40, 64, 45, 60, 85, NA,…
$ marital  <dbl+lbl> 1, 3, 1, 1, 1, 1, 5, 2, 1, 1, 1, 1, 1, 1, 2, 5, 1, 1, 5, 5,…
$ childs   <dbl+lbl> 3, 8, 3, 2, 0, 0, 0, 3, 3, 2, 3, 3, 2, 3, 6, 0, 4, 2, 0, 0,…
$ age      <dbl+lbl> 71, 69, 40, 60, 31, 37, 23, 86, 70, 42, 41, 30, 43, 48, 70,…
$ educ     <dbl+lbl> 18, 11, 19, 13, 11, 19, 11, 11, 13, 12, 12, 12, 13, 20, 12,…
$ sex      <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ polviews <dbl+lbl>  4,  4, NA,  5,  4,  2,  1,  4,  4,  3,  6, NA,  5,  5, NA,…
$ wwwhr    <dbl+lbl> NA, NA,  7,  1,  0,  3, NA, NA, NA,  3,  5, 20,  1,  2, NA,…
$ trustpeo <dbl+lbl>  4,  1, NA,  1,  2,  2, NA,  1,  4,  1,  2, NA,  5,  2, NA,…
$ wantbest <dbl+lbl>  2,  4, NA,  2,  2,  4, NA,  4,  1,  2,  2, NA,  1,  3, NA,…
$ advantge <dbl+lbl>  3,  2, NA,  1,  2,  2, NA,  2,  2,  2,  1, NA,  4,  4, NA,…
$ goodlife <dbl+lbl>  4, NA, NA, NA, NA,  1,  2,  2, NA, NA, NA, NA,  3, NA, NA,…
$ deckids  <dbl+lbl> NA, NA,  4, NA, NA, NA, NA, NA, NA, NA, NA,  3, NA, NA, NA,…
$ strsswrk <dbl+lbl> NA, NA,  5, NA, NA, NA, NA, NA, NA, NA, NA,  2, NA, NA,  3,…
$ satjob7  <dbl+lbl> NA, NA,  3, NA, NA, NA, NA, NA, NA, NA, NA,  3, NA, NA, NA,…

Wondering about variable labels? The str function can help…

str(gss$marital)
 dbl+lbl [1:2765] 1, 3, 1, 1, 1, 1, 5, 2, 1, 1, 1, 1, 1, 1, 2, 5, 1, 1, 5, 5, ...
 @ label       : chr "marital status"
 @ format.stata: chr "%16.0g"
 @ labels      : Named num [1:6] 1 2 3 4 5 9
  ..- attr(*, "names")= chr [1:6] "married" "widowed" "divorced" "separated" ...

Basic Exploratory Tools: Descriptive Statistics, Tabulations, and Histograms

Descriptive Statistics Using the describe Function from the psych Package

Tabulations with the table command

For categorical variables, the table command is a workhorse for data exploration. The table command allows you to examine frequencies for different levels of categorical variables.

table(gss$childs)

  0   1   2   3   4   5   6   7   8 
799 469 657 481 185  73  40  22  34 

A frequency histogram is a nice “quick and dirty” way to explore a continuous variable. The plotting functions used here (ggplot and geom_bar) come from the ggplot2 package. There are different ways to create a histogram in RStudio. Here is the easiest way, again using the childs variable:

ggplot(data = gss, mapping = aes(x = childs)) + geom_bar()

Here is a fancier version (say, for publication) with a title and labels added with the labs function:

ggplot(data = gss, mapping = aes(x = childs)) + geom_bar() +
        labs(title = "Distribution of Number of Children per Family",
             x = "Number of Children",
             caption = "Data from the General Social Survey (2012). N = 2,765.")

LS0tCnRpdGxlOiAiTXVsdGl2YXJpYXRlIFN0YXRpc3RpY3MsIE1vZHVsZSAxOiBHZXR0aW5nIFN0YXJ0ZWQgd2l0aCBSLCBEZXNjcmlwdGl2ZSBTdGF0aXN0aWNzLCBhbmQgUGxvdHMiCmF1dGhvcjogIkRyLiBCcm9kYSIKb3V0cHV0OiBodG1sX25vdGVib29rIAotLS0KCiMgTG9hZCBpbiB5b3VyIHBhY2thZ2VzIApUaGVzZSBjYW4gYWxzbyBiZSBmb3VuZCBvbiB0aGUgcmlnaHQgc2lkZSBtZW51IHVuZGVyICJQYWNrYWdlcyIuIFBhY2thZ2VzIGFyZSBtYWRlIGJ5IG90aGVyIFIgdXNlcnMgYW5kIGFyZSBncm91cHMgb2YgZnVuY3Rpb25zIHRoYXQgbWFrZSB1c2luZyBSIGVhc2llci4gIFRoZXJlIGFyZSBjZXJ0YWluIHBhY2thZ2VzIHdlIHRlbmQgdG8gdXNlIG1vcmUgb2Z0ZW4gdGhhbiBub3Qgc28gaXQncyB1c3VhbGx5IGdvb2QgdG8gc3RhcnQgd2l0aCBsb2FkaW5nIHRoZXNlIHBhY2thZ2VzLiBJZiB5b3UgZG9uJ3QgaGF2ZSB0aGVzZSBpbiB5b3VyIGxpYnJhcnksIHlvdSB3aWxsIGhhdmUgdG8gY2xpY2sgdGhlICJpbnN0YWxsIiBidXR0b24gaW4gdGhlIHBhY2thZ2VzIG1lbnUuCgpgYGB7cn0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkocHN5Y2gpCmxpYnJhcnkoaGF2ZW4pCmBgYAoKIyBMb2FkIGluIHRoZSBEYXRhClRoZXJlIGFyZSBsb3RzIG9mIHdheXMgdG8gbG9hZCBkYXRhIGludG8gUi4gQnV0IGhlcmUgaXMgYW4gZWFzeSBvbmU6IGdvIHRvICJGaWxlcyIgb24gdGhlIHJpZ2h0IHNpZGUsIGFuZCBzZWxlY3QgdGhlIGZpbGUgeW91IHdvdWxkIGxpa2UgdG8gdXNlLCBjbGljayB0aGUgZmlsZSBhbmQgImltcG9ydCBkYXRhc2V0Ii4gQ29weSBhbmQgcGFzdGUgdGhlIGNvZGUgcHJldmlldyBpbnRvIFIgbm90ZWJvb2sgKG9yIGp1c3QgY2xpY2sgaW1wb3J0IGRhdGFzZXQpLiBgVmlld2AgYW5kIGBnbGltcHNlYCBhcmUgZ3JlYXQgd2F5cyB0byBnZXQgYW4gaW5pdGlhbCBmZWVsIGZvciB0aGUgc3RydWN0dXJlIG9mIHlvdXIgZGF0YS4KYGBge3J9CmdzcyA8LSByZWFkX2R0YSgiZGVzY3JpcHRpdmVfZ3NzLmR0YSIpCgpWaWV3KGdzcykKCmdsaW1wc2UoZ3NzKQpgYGAKCiMgV29uZGVyaW5nIGFib3V0IHZhcmlhYmxlIGxhYmVscz8gVGhlIGBzdHJgIGZ1bmN0aW9uIGNhbiBoZWxwLi4uCmBgYHtyfQpzdHIoZ3NzJG1hcml0YWwpCnN0cihnc3Mkc2V4KQpzdHIoZ3NzJGNoaWxkcykKYGBgCgojIEJhc2ljIEV4cGxvcmF0b3J5IFRvb2xzOiBEZXNjcmlwdGl2ZSBTdGF0aXN0aWNzLCBUYWJ1bGF0aW9ucywgYW5kIEhpc3RvZ3JhbXMKIyMgRGVzY3JpcHRpdmUgU3RhdGlzdGljcyBVc2luZyB0aGUgYGRlc2NyaWJlYCBGdW5jdGlvbiBmcm9tIHRoZSBgcHN5Y2hgIFBhY2thZ2UKYGBge3J9CmRlc2NyaWJlKGdzcywgZmFzdCA9IFRSVUUpCmBgYAoKIyBUYWJ1bGF0aW9ucyB3aXRoIHRoZSBgdGFibGVgIGNvbW1hbmQKRm9yIGNhdGVnb3JpY2FsIHZhcmlhYmxlcywgdGhlIHRhYmxlIGNvbW1hbmQgaXMgYSB3b3JraG9yc2UgZm9yIGRh
dGEgZXhwbG9yYXRpb24uIFRoZSB0YWJsZSBjb21tYW5kIGFsbG93cyB5b3UgdG8gZXhhbWluZSBmcmVxdWVuY2llcyBmb3IgZGlmZmVyZW50IGxldmVscyBvZiBjYXRlZ29yaWNhbCB2YXJpYWJsZXMuCmBgYHtyfQp0YWJsZShnc3MkbWFyaXRhbCkKdGFibGUoZ3NzJHNleCkKdGFibGUoZ3NzJGNoaWxkcykKYGBgCgpBIGZyZXF1ZW5jeSBoaXN0b3JncmFtIGlzIGEgbmljZSDigJxxdWljayBhbmQgZGlydHnigJ0gd2F5IHRvIGV4cGxvcmUgYSBjb250aW51b3VzIHZhcmlhYmxlLiBUaGUgaGlzdG9ncmFtIGZ1bmN0aW9uIGNvbWVzIGZyb20gdGhlIGBnZ3Bsb3QyYCBwYWNrYWdlLiBUaGVyZSBhcmUgZGlmZmVyZW50IHdheXMgdG8gY3JlYXRlIGEgaGlzdG9ncmFtIGluIFJzdHVkaW8uIEhlcmUgaXMgdGhlIGVhc2llc3Qgd2F5LCBhZ2FpbiB1c2luZyB0aGUgY2hpbGRzIHZhcmlhYmxlOgpgYGB7cn0KZ2dwbG90KGRhdGEgPSBnc3MsIG1hcHBpbmcgPSBhZXMoeCA9IGNoaWxkcykpICsgZ2VvbV9iYXIoKQpgYGAKCkhlcmUgaXMgYSBmYW5jaWVyIHZlcnNpb24gKHNheSwgZm9yIHB1YmxpY2F0aW9uKSB3aXRoIGEgdGl0bGUgYW5kIGxhYmVscyBhZGRlZCB3aXRoIHRoZSBgbGFic2AgZnVuY3Rpb246CmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGdzcywgbWFwcGluZyA9IGFlcyh4ID0gY2hpbGRzKSkgKyBnZW9tX2JhcigpICsKICAgICAgICBsYWJzKHRpdGxlID0gIkRpc3RyaWJ1dGlvbiBvZiBOdW1iZXIgb2YgQ2hpbGRyZW4gcGVyIEZhbWlseSIsCiAgICAgICAgICAgICB4ID0gIk51bWJlciBvZiBDaGlsZHJlbiIsCiAgICAgICAgICAgICBjYXB0aW9uID0gIkRhdGEgZnJvbSB0aGUgR2VuZXJhbCBTb2NpYWwgU3VydmV5ICgyMDEyKS4gTiA9IDIsNzY1LiIpCmBgYAoK