# List the datasets available in the attached packages.
data()
# To list the datasets of every installed package instead, run:
# data(package = .packages(all.available = TRUE))

# Copy the built-in USArrests data frame into a working object.
mydata <- USArrests

# Preview the first six rows.
head(mydata, n = 6L)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
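Explanation of the dataset: USArrests reports, for each of the 50 US states in 1973, the number of arrests per 100,000 residents for murder, assault and rape, together with the percentage of the population living in urban areas (UrbanPop).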
# Per-column summary (min, quartiles, mean, max) of the arrests data.
summary(object = mydata)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
# Attach psych, which makes the sat.act example dataset available.
library(psych)

# Work on a copy so later edits do not touch the packaged dataset.
mydata2 <- sat.act

# Preview the first six rows.
head(mydata2, n = 6L)
## gender education age ACT SATV SATQ
## 29442 2 3 19 24 500 500
## 29457 2 3 23 35 600 500
## 29498 2 3 20 21 480 470
## 29503 1 4 27 26 550 520
## 29504 1 2 33 31 600 550
## 29518 1 5 26 28 640 640
# Replace all six column names with more descriptive labels.
names(mydata2) <- c(
  "Gender", "Education", "Age",
  "AmerCollTest", "Verbal", "Quant"
)
head(mydata2, n = 6L)
## Gender Education Age AmerCollTest Verbal Quant
## 29442 2 3 19 24 500 500
## 29457 2 3 23 35 600 500
## 29498 2 3 20 21 480 470
## 29503 1 4 27 26 550 520
## 29504 1 2 33 31 600 550
## 29518 1 5 26 28 640 640
# Rename a single column by position: only the fourth name is replaced.
# NOTE(review): "Collage" looks like a typo for "College"; it is left
# unchanged because the printed output (and possibly later code) uses
# this exact name.
names(mydata2)[4] <- "Collage"
head(mydata2, n = 6L)
## Gender Education Age Collage Verbal Quant
## 29442 2 3 19 24 500 500
## 29457 2 3 23 35 600 500
## 29498 2 3 20 21 480 470
## 29503 1 4 27 26 550 520
## 29504 1 2 33 31 600 550
## 29518 1 5 26 28 640 640
When a categorical variable is coded with numbers, convert it with the function factor() so that R treats it as categorical rather than numeric.
# Add a labelled factor version of the numeric Gender codes:
# code 2 becomes "F" and code 1 becomes "M", with "F" as the first level.
mydata2[["GenderF"]] <- factor(
  mydata2[["Gender"]],
  levels = c(2, 1),
  labels = c("F", "M")
)
head(mydata2, n = 6L)
## Gender Education Age Collage Verbal Quant GenderF
## 29442 2 3 19 24 500 500 F
## 29457 2 3 23 35 600 500 F
## 29498 2 3 20 21 480 470 F
## 29503 1 4 27 26 550 520 M
## 29504 1 2 33 31 600 550 M
## 29518 1 5 26 28 640 640 M
# Per-column summary; the factor column is reported as counts per level.
summary(object = mydata2)
## Gender Education Age Collage
## Min. :1.000 Min. :0.000 Min. :13.00 Min. : 3.00
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:19.00 1st Qu.:25.00
## Median :2.000 Median :3.000 Median :22.00 Median :29.00
## Mean :1.647 Mean :3.164 Mean :25.59 Mean :28.55
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:29.00 3rd Qu.:32.00
## Max. :2.000 Max. :5.000 Max. :65.00 Max. :36.00
##
## Verbal Quant GenderF
## Min. :200.0 Min. :200.0 F:453
## 1st Qu.:550.0 1st Qu.:530.0 M:247
## Median :620.0 Median :620.0
## Mean :612.2 Mean :610.2
## 3rd Qu.:700.0 3rd Qu.:700.0
## Max. :800.0 Max. :800.0
## NA's :13
# tidyr supplies drop_na(); install it once if it is missing:
# install.packages("tidyr")
library(tidyr)

# Keep only the rows that have no missing value in any column.
mydata3 <- tidyr::drop_na(data = mydata2)
Create mydata4, which includes only students aged between 20 and 25 (inclusive).
# Keep the rows whose Age lies in the closed interval [20, 25].
mydata4 <- mydata3[with(mydata3, Age >= 20 & Age <= 25), ]
Create mydata5, which includes only females.
# Keep the rows whose GenderF label is "F".
mydata5 <- mydata3[mydata3[["GenderF"]] == "F", , drop = FALSE]
Compute descriptive statistics by gender. Use the function describeBy().
# Attach psych for describeBy() (no effect if it is already attached).
library(psych)

# Descriptive statistics of the Verbal scores, one table per GenderF level.
describeBy(x = mydata3$Verbal, group = mydata3$GenderF)
##
## Descriptive statistics by group
## group: F
## vars n mean sd median trimmed mad min max range skew
## X1 1 442 610.66 112.81 620 618.09 103.78 200 800 600 -0.66
## kurtosis se
## X1 0.43 5.37
## ----------------------------------------------------
## group: M
## vars n mean sd median trimmed mad min max range skew
## X1 1 245 615.36 114.33 630 622.44 118.61 200 800 600 -0.63
## kurtosis se
## X1 0.14 7.3