PART ONE: BUILDING AND EXPLORING A DATASET WITH R

Practice entering data in three ways

## Way 1: store a single student's record as a numeric vector
## (the last three values match student 1's test scores entered further below)
Student_1_vector <- c(1, 1, 48, 56, 49)

## Way 2: store several students in a matrix -- the integers 1 to 50 are laid
## out row by row into 10 rows (and therefore 5 columns)
Student_matrix <- matrix(seq_len(50), nrow = 10, ncol = 5, byrow = TRUE)

## Create a data frame with ID numbers, demographic data, and test score data
## for multiple students:

# First, create one vector per variable (each vector holds one value per student):
ID <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
Gender <- rep(c("Female", "Male"), times = 5)
Test_1 <- c(48, 58, 63, 28, 38, 40, 51, 46, 55, 62)
Test_2 <- c(56, 53, 74, 24, 56, 41, 46, 36, 51, 50)
Test_3 <- c(49, 44, 69, 26, 46, 40, 36, 41, 54, 66)

# Then, combine the vectors into a dataframe.
# stringsAsFactors = TRUE stores Gender as a factor inside the dataframe. Since
# R 4.0.0 the default is FALSE, which would keep Gender as character and make
# summary() report Length/Class/Mode instead of the Female/Male counts shown
# in the output further below.
Student_dataframe <- data.frame(ID, Gender, Test_1, Test_2, Test_3,
                                stringsAsFactors = TRUE)

Student_dataframe # run this to print the dataframe to the screen
##    ID Gender Test_1 Test_2 Test_3
## 1   1 Female     48     56     49
## 2   2   Male     58     53     44
## 3   3 Female     63     74     69
## 4   4   Male     28     24     26
## 5   5 Female     38     56     46
## 6   6   Male     40     41     40
## 7   7 Female     51     46     36
## 8   8   Male     46     36     41
## 9   9 Female     55     51     54
## 10 10   Male     62     50     66
## Summarise every column of the student dataframe in one call:
summary(object = Student_dataframe)
##        ID           Gender      Test_1          Test_2     
##  Min.   : 1.00   Female:5   Min.   :28.00   Min.   :24.00  
##  1st Qu.: 3.25   Male  :5   1st Qu.:41.50   1st Qu.:42.25  
##  Median : 5.50              Median :49.50   Median :50.50  
##  Mean   : 5.50              Mean   :48.90   Mean   :48.70  
##  3rd Qu.: 7.75              3rd Qu.:57.25   3rd Qu.:55.25  
##  Max.   :10.00              Max.   :63.00   Max.   :74.00  
##      Test_3     
##  Min.   :26.00  
##  1st Qu.:40.25  
##  Median :45.00  
##  Mean   :47.10  
##  3rd Qu.:52.75  
##  Max.   :69.00
## Spread of the scores: standard deviation of each of the three tests
sd_test1 <- sd(x = Test_1)
sd_test1
## [1] 11.2492
sd_test2 <- sd(x = Test_2)
sd_test2
## [1] 13.37535
sd_test3 <- sd(x = Test_3)
sd_test3
## [1] 13.14407
## Total score per student: element-wise sum of the three test vectors
TOT_SCORE <- Reduce(`+`, list(Test_1, Test_2, Test_3))
TOT_SCORE
##  [1] 153 155 206  78 140 121 133 123 160 178
#** Note: the same total can be stored as a new column of the dataframe;
#** with() lets the column names be used without the Student_dataframe$ prefix:
Student_dataframe$TOT_SCORE <- with(Student_dataframe, Test_1 + Test_2 + Test_3)

# Show the first 6 rows of the dataframe to confirm the new column is there:

head(Student_dataframe, n = 6L)
##   ID Gender Test_1 Test_2 Test_3 TOT_SCORE
## 1  1 Female     48     56     49       153
## 2  2   Male     58     53     44       155
## 3  3 Female     63     74     69       206
## 4  4   Male     28     24     26        78
## 5  5 Female     38     56     46       140
## 6  6   Male     40     41     40       121
## Standardise each student's total score (Z score).

# Step 1: the mean of the students' total scores:
Mean_TOT <- mean(x = TOT_SCORE)
Mean_TOT
## [1] 144.7
# Step 2: the standard deviation of those same total scores:
Sd_TOT <- sd(x = TOT_SCORE)
Sd_TOT
## [1] 34.82671
# Step 3: Z = (score - mean) / sd, applied to the whole vector at once:
Zscore <- (TOT_SCORE - Mean_TOT) / Sd_TOT
Zscore
##  [1]  0.2383228  0.2957500  1.7601431 -1.9151965 -0.1349539 -0.6805121
##  [7] -0.3359490 -0.6230849  0.4393179  0.9561626
#* Note: the Z scores can be stored as a new column of the dataframe like this:

Student_dataframe[["Zscore"]] <- Zscore

# Show the first 6 rows of the dataframe to confirm the new column is there:
head(Student_dataframe, n = 6L)
##   ID Gender Test_1 Test_2 Test_3 TOT_SCORE     Zscore
## 1  1 Female     48     56     49       153  0.2383228
## 2  2   Male     58     53     44       155  0.2957500
## 3  3 Female     63     74     69       206  1.7601431
## 4  4   Male     28     24     26        78 -1.9151965
## 5  5 Female     38     56     46       140 -0.1349539
## 6  6   Male     40     41     40       121 -0.6805121

PART TWO: SPLITTING A FILE

## Break the student dataframe into a list of dataframes, one per Gender value
by_gender <- split(x = Student_dataframe, f = Student_dataframe[["Gender"]])

## Summary statistics for the male students' dataframe:
summary(by_gender[["Male"]])
##        ID        Gender      Test_1         Test_2         Test_3    
##  Min.   : 2   Female:0   Min.   :28.0   Min.   :24.0   Min.   :26.0  
##  1st Qu.: 4   Male  :5   1st Qu.:40.0   1st Qu.:36.0   1st Qu.:40.0  
##  Median : 6              Median :46.0   Median :41.0   Median :41.0  
##  Mean   : 6              Mean   :46.8   Mean   :40.8   Mean   :43.4  
##  3rd Qu.: 8              3rd Qu.:58.0   3rd Qu.:50.0   3rd Qu.:44.0  
##  Max.   :10              Max.   :62.0   Max.   :53.0   Max.   :66.0  
##    TOT_SCORE       Zscore       
##  Min.   : 78   Min.   :-1.9152  
##  1st Qu.:121   1st Qu.:-0.6805  
##  Median :123   Median :-0.6231  
##  Mean   :131   Mean   :-0.3934  
##  3rd Qu.:155   3rd Qu.: 0.2958  
##  Max.   :178   Max.   : 0.9562
# Total score for male students (sum of the three test columns of the male subset):
TOT_SCORE_M <- with(by_gender$Male, Test_1 + Test_2 + Test_3)
TOT_SCORE_M
## [1] 155  78 121 123 178
# Mean of the male students' total scores:
mean_TOT_M <- mean(x = TOT_SCORE_M)
mean_TOT_M
## [1] 131
# Standard deviation of the male students' total scores:
sd_TOT_M <- sd(x = TOT_SCORE_M)
sd_TOT_M
## [1] 37.94074
# Z scores for male students, standardised against the male mean and SD:
Z_M <- (TOT_SCORE_M - mean_TOT_M) / sd_TOT_M
Z_M
## [1]  0.6325654 -1.3969152 -0.2635689 -0.2108551  1.2387738
## [1]  0.6325654 -1.3969152 -0.2635689 -0.2108551  1.2387738
## Summary statistics for the female students' dataframe:
summary(by_gender[["Female"]])
##        ID       Gender      Test_1       Test_2         Test_3    
##  Min.   :1   Female:5   Min.   :38   Min.   :46.0   Min.   :36.0  
##  1st Qu.:3   Male  :0   1st Qu.:48   1st Qu.:51.0   1st Qu.:46.0  
##  Median :5              Median :51   Median :56.0   Median :49.0  
##  Mean   :5              Mean   :51   Mean   :56.6   Mean   :50.8  
##  3rd Qu.:7              3rd Qu.:55   3rd Qu.:56.0   3rd Qu.:54.0  
##  Max.   :9              Max.   :63   Max.   :74.0   Max.   :69.0  
##    TOT_SCORE         Zscore       
##  Min.   :133.0   Min.   :-0.3359  
##  1st Qu.:140.0   1st Qu.:-0.1350  
##  Median :153.0   Median : 0.2383  
##  Mean   :158.4   Mean   : 0.3934  
##  3rd Qu.:160.0   3rd Qu.: 0.4393  
##  Max.   :206.0   Max.   : 1.7601
# Total score for female students (sum of the three test columns of the female subset):
TOT_SCORE_F <- with(by_gender$Female, Test_1 + Test_2 + Test_3)
TOT_SCORE_F
## [1] 153 206 140 133 160
# Mean of the female students' total scores:
mean_TOT_F <- mean(x = TOT_SCORE_F)
mean_TOT_F
## [1] 158.4
# Standard deviation of the female students' total scores:
sd_TOT_F <- sd(x = TOT_SCORE_F)
sd_TOT_F
## [1] 28.64088
# Z scores for female students, standardised against the female mean and SD:
Z_F <- (TOT_SCORE_F - mean_TOT_F) / sd_TOT_F
Z_F
## [1] -0.18854169  1.66196011 -0.64243836 -0.88684426  0.05586421
## [1] -0.18854169  1.66196011 -0.64243836 -0.88684426  0.05586421

PART THREE: IMPORTING AN EXISTING DATASET

### Import the HSB dataset:

library(haven)
# Read the SPSS file with haven's read_sav(). The path is relative to the
# current working directory -- this part will be unique to your computer!
hsb <- read_sav("hsb2.sav")
# Open the spreadsheet-style viewer only in an interactive session: View()
# needs a data viewer/GUI and can fail when the script is run in batch mode
# (e.g. with Rscript or while knitting a document).
if (interactive()) {
  View(hsb)
}

### Inspect what the HSB dataset contains:

## the first 6 rows:
head(hsb, n = 6L)
## # A tibble: 6 x 11
##      id female    race  ses   schtyp prog   read write  math science socst
##   <dbl> <dbl+lbl> <dbl> <dbl> <dbl+> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1  70.0 0         4     1     1      1      57.0  52.0  41.0    47.0  57.0
## 2 121   1         4     2     1      3      68.0  59.0  53.0    63.0  61.0
## 3  86.0 0         4     3     1      1      44.0  33.0  54.0    58.0  31.0
## 4 141   0         4     3     1      3      63.0  44.0  47.0    53.0  56.0
## 5 172   0         4     2     1      2      47.0  52.0  57.0    53.0  61.0
## 6 113   0         4     2     1      2      44.0  52.0  51.0    63.0  61.0
## the last 6 rows:
tail(hsb, n = 6L)
## # A tibble: 6 x 11
##      id female    race  ses   schtyp prog   read write  math science socst
##   <dbl> <dbl+lbl> <dbl> <dbl> <dbl+> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1 179   1         4     2     2      2      47.0  65.0  60.0    50.0  56.0
## 2  31.0 1         2     2     2      1      55.0  59.0  52.0    42.0  56.0
## 3 145   1         4     2     1      3      42.0  46.0  38.0    36.0  46.0
## 4 187   1         4     2     2      1      57.0  41.0  57.0    55.0  52.0
## 5 118   1         4     2     1      1      55.0  62.0  58.0    58.0  61.0
## 6 137   1         4     3     1      2      63.0  65.0  65.0    53.0  61.0
## summarise every variable in the dataset:
summary(object = hsb)
##        id             female           race           ses       
##  Min.   :  1.00   Min.   :0.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.: 50.75   1st Qu.:0.000   1st Qu.:3.00   1st Qu.:2.000  
##  Median :100.50   Median :1.000   Median :4.00   Median :2.000  
##  Mean   :100.50   Mean   :0.545   Mean   :3.43   Mean   :2.055  
##  3rd Qu.:150.25   3rd Qu.:1.000   3rd Qu.:4.00   3rd Qu.:3.000  
##  Max.   :200.00   Max.   :1.000   Max.   :4.00   Max.   :3.000  
##      schtyp          prog            read           write      
##  Min.   :1.00   Min.   :1.000   Min.   :28.00   Min.   :31.00  
##  1st Qu.:1.00   1st Qu.:2.000   1st Qu.:44.00   1st Qu.:45.75  
##  Median :1.00   Median :2.000   Median :50.00   Median :54.00  
##  Mean   :1.16   Mean   :2.025   Mean   :52.23   Mean   :52.77  
##  3rd Qu.:1.00   3rd Qu.:2.250   3rd Qu.:60.00   3rd Qu.:60.00  
##  Max.   :2.00   Max.   :3.000   Max.   :76.00   Max.   :67.00  
##       math          science          socst      
##  Min.   :33.00   Min.   :26.00   Min.   :26.00  
##  1st Qu.:45.00   1st Qu.:44.00   1st Qu.:46.00  
##  Median :52.00   Median :53.00   Median :52.00  
##  Mean   :52.65   Mean   :51.85   Mean   :52.41  
##  3rd Qu.:59.00   3rd Qu.:58.00   3rd Qu.:61.00  
##  Max.   :75.00   Max.   :74.00   Max.   :71.00

PART FOUR: EXPLORING THE HSB DATA USING BASIC STATISTICS

### Histogram of the social studies scores:
hist(x = hsb$socst)

### Basic scatterplot: math scores on the x-axis, science scores on the y-axis
### (default title and axis labels):
plot(x = hsb$math, y = hsb$science)

# the same scatterplot with a title and more readable axis labels:
plot(
  x = hsb$math,
  y = hsb$science,
  xlab = "Math Scores",
  ylab = "Science Scores",
  main = "Scatterplot of Math & Science Scores"
)

#### Measures of association:

### Correlation and covariance between the math and reading scores,
### computed with cor() and cov()

with(hsb, cor(math, read))
## [1] 0.6622801
with(hsb, cov(math, read))
## [1] 63.61472