Load in your packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(haven)

Load in the Data

# Read the Stata file with haven; value labels are kept on the labelled columns.
HSBS1980 <- read_dta("hsb2.dta")

# View() opens an interactive data viewer, so only call it in an interactive
# session; it is skipped when the file is run or knitted non-interactively.
if (interactive()) {
  View(HSBS1980)
}

# Compact overview of the column names, types, and first values.
glimpse(HSBS1980)
## Rows: 200
## Columns: 11
## $ id      <dbl> 70, 121, 86, 141, 172, 113, 50, 11, 84, 48, 75, 60, 95, 104, 3…
## $ female  <dbl+lbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ race    <dbl+lbl> 4, 4, 4, 4, 4, 4, 3, 1, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4…
## $ ses     <dbl+lbl> 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 3, 2, 3, 2…
## $ schtyp  <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1…
## $ prog    <dbl+lbl> 1, 3, 1, 3, 2, 2, 1, 2, 1, 2, 3, 2, 2, 2, 2, 1, 2, 1, 2, 1…
## $ read    <dbl> 57, 68, 44, 63, 47, 44, 50, 34, 63, 57, 60, 57, 73, 54, 45, 42…
## $ write   <dbl> 52, 59, 33, 44, 52, 52, 59, 46, 57, 55, 46, 65, 60, 63, 57, 49…
## $ math    <dbl> 41, 53, 54, 47, 57, 51, 42, 45, 54, 52, 51, 51, 71, 57, 50, 43…
## $ science <dbl> 47, 63, 58, 53, 53, 63, 53, 39, 58, 50, 53, 63, 61, 55, 31, 50…
## $ socst   <dbl> 57, 61, 31, 56, 61, 61, 61, 36, 51, 51, 61, 61, 71, 46, 56, 56…

Wondering about variable labels? The str function can help…

# Inspect the structure of prog, including its variable label and value labels.
str(HSBS1980$prog)
##  dbl+lbl [1:200] 1, 3, 1, 3, 2, 2, 1, 2, 1, 2, 3, 2, 2, 2, 2, 1, 2, 1, 2, 1...
##  @ label       : chr "type of program"
##  @ format.stata: chr "%9.0g"
##  @ labels      : Named num [1:3] 1 2 3
##   ..- attr(*, "names")= chr [1:3] "general" "academic" "vocation"

QUESTION 1

Basic Exploratory Tools: Descriptive Statistics, Tabulations, and Histograms

Descriptive Statistics Using the describe Function from the psych Package

# Summary statistics for prog via describe().
# NOTE(review): prog holds category codes (1 = general, 2 = academic,
# 3 = vocation), so the mean/sd reported here summarize the codes themselves.
describe(HSBS1980$prog)
##    vars   n mean   sd median trimmed mad min max range  skew kurtosis   se
## X1    1 200 2.02 0.69      2    2.02   0   1   3     2 -0.03    -0.91 0.05

Tabulations with the table function

# Count how many rows fall under each program code.
table(HSBS1980$prog)
## 
##   1   2   3 
##  45 105  50
library(expss)
## Loading required package: maditr
## 
## To get total summary skip 'by' argument: take_all(mtcars, mean)
## 
## Attaching package: 'maditr'
## The following objects are masked from 'package:dplyr':
## 
##     between, coalesce, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following object is masked from 'package:readr':
## 
##     cols
## 
## Attaching package: 'expss'
## The following objects are masked from 'package:haven':
## 
##     is.labelled, read_spss
## The following objects are masked from 'package:stringr':
## 
##     fixed, regex
## The following objects are masked from 'package:dplyr':
## 
##     compute, contains, na_if, recode, vars
## The following objects are masked from 'package:purrr':
## 
##     keep, modify, modify_if, when
## The following objects are masked from 'package:tidyr':
## 
##     contains, nest
## The following object is masked from 'package:ggplot2':
## 
##     vars
# Show the value labels currently attached to prog.
val_lab(HSBS1980$prog)
##  general academic vocation 
##        1        2        3
# Attach value labels to the program-type codes, then tabulate again so the
# counts are reported per label rather than per numeric code.
add_val_lab(HSBS1980$prog) <- c(general = 1, academic = 2, vocation = 3)
table(HSBS1980$prog)
## 
##  general academic vocation 
##       45      105       50

QUESTION 2

# Inspect the structure of ses, including its stored codes and value labels.
str(HSBS1980$ses)
##  dbl+lbl [1:200] 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 3, 2, 3, 2...
##  @ format.stata: chr "%9.0g"
##  @ labels      : Named num [1:3] 1 2 3
##   ..- attr(*, "names")= chr [1:3] "low" "middle" "high"
# Split the sample into one data frame per socioeconomic-status code
# (1 = low, 2 = middle, 3 = high, per the value labels on ses).
lowses  <- subset(HSBS1980, ses == 1)
midses  <- subset(HSBS1980, ses == 2)
highses <- subset(HSBS1980, ses == 3)

# Histograms

# Summary statistics for writing scores across the whole sample
# (all SES groups combined).
describe(HSBS1980$write)
##    vars   n  mean   sd median trimmed   mad min max range  skew kurtosis   se
## X1    1 200 52.77 9.48     54   53.36 11.86  31  67    36 -0.47    -0.78 0.67
# Base-graphics histograms of writing scores, one per SES subgroup.
# Each call is written out separately: hist() builds its default title and
# x-axis label from the expression it is given (e.g. "lowses$write").
hist(lowses$write)

hist(midses$write)

hist(highses$write)

# Draw the distribution of writing scores for one SES subgroup.
#
# data:      data frame with a `write` column (one SES subgroup).
# ses_label: SES group name inserted into the plot title, e.g. "Low".
#
# Returns the ggplot object, which is printed when the call is evaluated at
# top level. The caption reports the subgroup's own row count; a hard-coded
# "N = 200" is the size of the full sample, not of any single subgroup.
plot_write_by_ses <- function(data, ses_label) {
  ggplot(data = data, mapping = aes(x = write)) +
    geom_bar() +
    labs(
      title = paste(
        "Distribution of Writing Scores for", ses_label, "SES Individuals"
      ),
      x = "Writing Scores",
      caption = paste0(
        "Data from the High School and Beyond Survey (1980). N = ",
        nrow(data)
      )
    )
}

plot_write_by_ses(lowses, "Low")

plot_write_by_ses(midses, "Middle")

plot_write_by_ses(highses, "High")

# Install TinyTeX only when it is not already the active LaTeX distribution.
# An unconditional install_tinytex(force = TRUE) would reinstall the whole
# distribution every time this file is run or knitted.
if (!tinytex::is_tinytex()) {
  tinytex::install_tinytex(force = TRUE)
}