# Practice entering data in three ways
# A single student's record, stored as a plain numeric vector.
Student_1_vector <- c(1, 1, 48, 56, 49)
# Records for several students, stored as a 10 x 5 integer matrix that is
# filled one row at a time with the values 1 through 50.
Student_matrix <- matrix(data = seq_len(50), nrow = 10, ncol = 5, byrow = TRUE)
## Create a data frame with ID numbers, demographic data, and test score data
## for multiple students:
# First, create one vector per variable:
ID <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
Gender <- c(
  "Female", "Male", "Female", "Male", "Female",
  "Male", "Female", "Male", "Female", "Male"
)
Test_1 <- c(48, 58, 63, 28, 38, 40, 51, 46, 55, 62)
Test_2 <- c(56, 53, 74, 24, 56, 41, 46, 36, 51, 50)
Test_3 <- c(49, 44, 69, 26, 46, 40, 36, 41, 54, 66)
# Then combine the vectors into a data frame.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 data.frame() no
# longer converts character vectors to factors by default, and the summaries
# shown in this script (per-level counts such as "Female:5 Male:5") require
# Gender to be a factor inside the data frame. The standalone Gender vector
# above stays a character vector.
Student_dataframe <- data.frame(
  ID, Gender, Test_1, Test_2, Test_3,
  stringsAsFactors = TRUE
)
Student_dataframe # run this to print the data frame to the screen
## ID Gender Test_1 Test_2 Test_3
## 1 1 Female 48 56 49
## 2 2 Male 58 53 44
## 3 3 Female 63 74 69
## 4 4 Male 28 24 26
## 5 5 Female 38 56 46
## 6 6 Male 40 41 40
## 7 7 Female 51 46 36
## 8 8 Male 46 36 41
## 9 9 Female 55 51 54
## 10 10 Male 62 50 66
## Get the summary of the data frame of student data:
summary(Student_dataframe)
## ID Gender Test_1 Test_2
## Min. : 1.00 Female:5 Min. :28.00 Min. :24.00
## 1st Qu.: 3.25 Male :5 1st Qu.:41.50 1st Qu.:42.25
## Median : 5.50 Median :49.50 Median :50.50
## Mean : 5.50 Mean :48.90 Mean :48.70
## 3rd Qu.: 7.75 3rd Qu.:57.25 3rd Qu.:55.25
## Max. :10.00 Max. :63.00 Max. :74.00
## Test_3
## Min. :26.00
## 1st Qu.:40.25
## Median :45.00
## Mean :47.10
## 3rd Qu.:52.75
## Max. :69.00
# Standard deviation of the scores on each of the three tests.
# Wrapping an assignment in parentheses stores the value and auto-prints it,
# so each statement below both saves and displays its result.
(sd_test1 <- sd(Test_1))
## [1] 11.2492
(sd_test2 <- sd(Test_2))
## [1] 13.37535
(sd_test3 <- sd(Test_3))
## [1] 13.14407
# Total score per student: the element-wise sum of the three test vectors.
# The surrounding parentheses make the assignment print its value.
(TOT_SCORE <- Test_1 + Test_2 + Test_3)
## [1] 153 155 206 78 140 121 133 123 160 178
# The same total can also be stored as a new column of the data frame,
# computed here from the data frame's own test columns:
Student_dataframe[["TOT_SCORE"]] <- with(
  Student_dataframe,
  Test_1 + Test_2 + Test_3
)
# Show the first 6 rows to confirm the column was added:
head(Student_dataframe, n = 6L)
## ID Gender Test_1 Test_2 Test_3 TOT_SCORE
## 1 1 Female 48 56 49 153
## 2 2 Male 58 53 44 155
## 3 3 Female 63 74 69 206
## 4 4 Male 28 24 26 78
## 5 5 Female 38 56 46 140
## 6 6 Male 40 41 40 121
# Z score of each student's total score:
#   (total score - mean of all total scores) / sd of all total scores
# Step 1: mean of the students' total scores.
(Mean_TOT <- mean(TOT_SCORE))
## [1] 144.7
# Step 2: standard deviation of the students' total scores.
(Sd_TOT <- sd(TOT_SCORE))
## [1] 34.82671
# Step 3: centre each total on the mean and rescale by the sd.
(Zscore <- (TOT_SCORE - Mean_TOT) / Sd_TOT)
## [1] 0.2383228 0.2957500 1.7601431 -1.9151965 -0.1349539 -0.6805121
## [7] -0.3359490 -0.6230849 0.4393179 0.9561626
# The Z scores can be stored as a new column of the data frame:
Student_dataframe[["Zscore"]] <- Zscore
# Show the first 6 rows to confirm the column was added:
head(Student_dataframe, n = 6L)
## ID Gender Test_1 Test_2 Test_3 TOT_SCORE Zscore
## 1 1 Female 48 56 49 153 0.2383228
## 2 2 Male 58 53 44 155 0.2957500
## 3 3 Female 63 74 69 206 1.7601431
## 4 4 Male 28 24 26 78 -1.9151965
## 5 5 Female 38 56 46 140 -0.1349539
## 6 6 Male 40 41 40 121 -0.6805121
# PART TWO: SPLITTING A FILE
# Partition the student data frame into a list holding one data frame per
# value of Gender.
by_gender <- split(Student_dataframe, f = Student_dataframe[["Gender"]])
# Summary statistics for the male students only:
summary(by_gender[["Male"]])
## ID Gender Test_1 Test_2 Test_3
## Min. : 2 Female:0 Min. :28.0 Min. :24.0 Min. :26.0
## 1st Qu.: 4 Male :5 1st Qu.:40.0 1st Qu.:36.0 1st Qu.:40.0
## Median : 6 Median :46.0 Median :41.0 Median :41.0
## Mean : 6 Mean :46.8 Mean :40.8 Mean :43.4
## 3rd Qu.: 8 3rd Qu.:58.0 3rd Qu.:50.0 3rd Qu.:44.0
## Max. :10 Max. :62.0 Max. :53.0 Max. :66.0
## TOT_SCORE Zscore
## Min. : 78 Min. :-1.9152
## 1st Qu.:121 1st Qu.:-0.6805
## Median :123 Median :-0.6231
## Mean :131 Mean :-0.3934
## 3rd Qu.:155 3rd Qu.: 0.2958
## Max. :178 Max. : 0.9562
# Total score of each male student (sum of the three tests).
# Parenthesised assignments below store the value and print it.
(TOT_SCORE_M <- with(by_gender[["Male"]], Test_1 + Test_2 + Test_3))
## [1] 155 78 121 123 178
# Mean of the male students' total scores:
(mean_TOT_M <- mean(TOT_SCORE_M))
## [1] 131
# Standard deviation of the male students' total scores:
(sd_TOT_M <- sd(TOT_SCORE_M))
## [1] 37.94074
# Z scores of the male students, standardised within the male group:
(Z_M <- (TOT_SCORE_M - mean_TOT_M) / sd_TOT_M)
## [1] 0.6325654 -1.3969152 -0.2635689 -0.2108551 1.2387738
# Summary statistics for the female students only:
summary(by_gender[["Female"]])
## ID Gender Test_1 Test_2 Test_3
## Min. :1 Female:5 Min. :38 Min. :46.0 Min. :36.0
## 1st Qu.:3 Male :0 1st Qu.:48 1st Qu.:51.0 1st Qu.:46.0
## Median :5 Median :51 Median :56.0 Median :49.0
## Mean :5 Mean :51 Mean :56.6 Mean :50.8
## 3rd Qu.:7 3rd Qu.:55 3rd Qu.:56.0 3rd Qu.:54.0
## Max. :9 Max. :63 Max. :74.0 Max. :69.0
## TOT_SCORE Zscore
## Min. :133.0 Min. :-0.3359
## 1st Qu.:140.0 1st Qu.:-0.1350
## Median :153.0 Median : 0.2383
## Mean :158.4 Mean : 0.3934
## 3rd Qu.:160.0 3rd Qu.: 0.4393
## Max. :206.0 Max. : 1.7601
# Total score of each female student (sum of the three tests).
# Parenthesised assignments below store the value and print it.
(TOT_SCORE_F <- with(by_gender[["Female"]], Test_1 + Test_2 + Test_3))
## [1] 153 206 140 133 160
# Mean of the female students' total scores:
(mean_TOT_F <- mean(TOT_SCORE_F))
## [1] 158.4
# Standard deviation of the female students' total scores:
(sd_TOT_F <- sd(TOT_SCORE_F))
## [1] 28.64088
# Z scores of the female students, standardised within the female group:
(Z_F <- (TOT_SCORE_F - mean_TOT_F) / sd_TOT_F)
## [1] -0.18854169 1.66196011 -0.64243836 -0.88684426 0.05586421
# PART THREE: IMPORTING AN EXISTING DATASET
### Import the HSB dataset:
library(haven)
# Read the SPSS file into `hsb`. The path is relative to the current working
# directory, so this part will be unique to your computer!
hsb <- read_sav("hsb2.sav")
# View() opens a data viewer window, which is only useful in an interactive
# session; skip it when the script is run non-interactively (e.g. via Rscript
# or while knitting) so that the rest of the script still runs.
if (interactive()) {
  View(hsb)
}
### Check the HSB dataset contents:
## look at the first 6 lines:
head(hsb)
## # A tibble: 6 x 11
## id female race ses schtyp prog read write math science socst
## <dbl> <dbl+lbl> <dbl> <dbl> <dbl+> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 70.0 0 4 1 1 1 57.0 52.0 41.0 47.0 57.0
## 2 121 1 4 2 1 3 68.0 59.0 53.0 63.0 61.0
## 3 86.0 0 4 3 1 1 44.0 33.0 54.0 58.0 31.0
## 4 141 0 4 3 1 3 63.0 44.0 47.0 53.0 56.0
## 5 172 0 4 2 1 2 47.0 52.0 57.0 53.0 61.0
## 6 113 0 4 2 1 2 44.0 52.0 51.0 63.0 61.0
# Show the last 6 rows of the dataset:
tail(hsb, n = 6L)
## # A tibble: 6 x 11
## id female race ses schtyp prog read write math science socst
## <dbl> <dbl+lbl> <dbl> <dbl> <dbl+> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 179 1 4 2 2 2 47.0 65.0 60.0 50.0 56.0
## 2 31.0 1 2 2 2 1 55.0 59.0 52.0 42.0 56.0
## 3 145 1 4 2 1 3 42.0 46.0 38.0 36.0 46.0
## 4 187 1 4 2 2 1 57.0 41.0 57.0 55.0 52.0
## 5 118 1 4 2 1 1 55.0 62.0 58.0 58.0 61.0
## 6 137 1 4 3 1 2 63.0 65.0 65.0 53.0 61.0
# Per-variable summary statistics for the whole dataset:
summary(object = hsb)
## id female race ses
## Min. : 1.00 Min. :0.000 Min. :1.00 Min. :1.000
## 1st Qu.: 50.75 1st Qu.:0.000 1st Qu.:3.00 1st Qu.:2.000
## Median :100.50 Median :1.000 Median :4.00 Median :2.000
## Mean :100.50 Mean :0.545 Mean :3.43 Mean :2.055
## 3rd Qu.:150.25 3rd Qu.:1.000 3rd Qu.:4.00 3rd Qu.:3.000
## Max. :200.00 Max. :1.000 Max. :4.00 Max. :3.000
## schtyp prog read write
## Min. :1.00 Min. :1.000 Min. :28.00 Min. :31.00
## 1st Qu.:1.00 1st Qu.:2.000 1st Qu.:44.00 1st Qu.:45.75
## Median :1.00 Median :2.000 Median :50.00 Median :54.00
## Mean :1.16 Mean :2.025 Mean :52.23 Mean :52.77
## 3rd Qu.:1.00 3rd Qu.:2.250 3rd Qu.:60.00 3rd Qu.:60.00
## Max. :2.00 Max. :3.000 Max. :76.00 Max. :67.00
## math science socst
## Min. :33.00 Min. :26.00 Min. :26.00
## 1st Qu.:45.00 1st Qu.:44.00 1st Qu.:46.00
## Median :52.00 Median :53.00 Median :52.00
## Mean :52.65 Mean :51.85 Mean :52.41
## 3rd Qu.:59.00 3rd Qu.:58.00 3rd Qu.:61.00
## Max. :75.00 Max. :74.00 Max. :71.00