Example 1

# List the datasets that ship with the currently attached packages.
data()
# To list the datasets of every installed package instead, run:
# data(package = .packages(all.available = TRUE))

# Work on a copy of the built-in USArrests data frame.
mydata <- USArrests

# Preview the first six rows.
head(mydata, n = 6)

##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

Explanation of dataset

Murder: Number of murder arrests per 100,000 citizens
Assault: Number of assault arrests per 100,000 citizens

# Per-column summary statistics (min, quartiles, mean, max) of mydata.
summary(object = mydata)

##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

# Attach psych; the sat.act data frame is taken from the attached packages.
library(psych)

# Work on a copy of sat.act and preview its first six rows.
mydata2 <- sat.act
head(mydata2, n = 6)

##       gender education age ACT SATV SATQ
## 29442      2         3  19  24  500  500
## 29457      2         3  23  35  600  500
## 29498      2         3  20  21  480  470
## 29503      1         4  27  26  550  520
## 29504      1         2  33  31  600  550
## 29518      1         5  26  28  640  640

# Replace all six column names with more descriptive labels.
names(mydata2) <- c(
  "Gender", "Education", "Age",
  "AmerCollTest", "Verbal", "Quant"
)

# Preview the first six rows with the new names.
head(mydata2, n = 6)

##       Gender Education Age AmerCollTest Verbal Quant
## 29442      2         3  19           24    500   500
## 29457      2         3  23           35    600   500
## 29498      2         3  20           21    480   470
## 29503      1         4  27           26    550   520
## 29504      1         2  33           31    600   550
## 29518      1         5  26           28    640   640

# Rename only the fourth column, leaving the other names untouched.
# NOTE(review): "Collage" looks like a typo for "College" -- confirm before
# changing it, because the printed output uses this spelling.
names(mydata2)[4] <- "Collage"

# Preview the first six rows with the renamed column.
head(mydata2, n = 6)

##       Gender Education Age Collage Verbal Quant
## 29442      2         3  19      24    500   500
## 29457      2         3  23      35    600   500
## 29498      2         3  20      21    480   470
## 29503      1         4  27      26    550   520
## 29504      1         2  33      31    600   550
## 29518      1         5  26      28    640   640

When a variable is categorical but coded with numbers, convert it with the factor() function.

# Add a labelled factor version of the numeric gender code:
# code 2 is labelled "F" and code 1 is labelled "M", with "F" as first level.
mydata2[["GenderF"]] <- factor(
  mydata2[["Gender"]],
  levels = c(2, 1),
  labels = c("F", "M")
)

# Preview the first six rows, now including GenderF.
head(mydata2, n = 6)

##       Gender Education Age Collage Verbal Quant GenderF
## 29442      2         3  19      24    500   500       F
## 29457      2         3  23      35    600   500       F
## 29498      2         3  20      21    480   470       F
## 29503      1         4  27      26    550   520       M
## 29504      1         2  33      31    600   550       M
## 29518      1         5  26      28    640   640       M

# Per-column summary of mydata2; the factor column is reported as counts.
summary(object = mydata2)

##      Gender        Education          Age           Collage     
##  Min.   :1.000   Min.   :0.000   Min.   :13.00   Min.   : 3.00  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.:19.00   1st Qu.:25.00  
##  Median :2.000   Median :3.000   Median :22.00   Median :29.00  
##  Mean   :1.647   Mean   :3.164   Mean   :25.59   Mean   :28.55  
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:29.00   3rd Qu.:32.00  
##  Max.   :2.000   Max.   :5.000   Max.   :65.00   Max.   :36.00  
##                                                                 
##      Verbal          Quant       GenderF
##  Min.   :200.0   Min.   :200.0   F:453  
##  1st Qu.:550.0   1st Qu.:530.0   M:247  
##  Median :620.0   Median :620.0          
##  Mean   :612.2   Mean   :610.2          
##  3rd Qu.:700.0   3rd Qu.:700.0          
##  Max.   :800.0   Max.   :800.0          
##                  NA's   :13

# tidyr supplies drop_na(); install it once if it is not available:
# install.packages("tidyr")
library(tidyr)

# Keep only the rows that have no missing value in any column.
mydata3 <- drop_na(data = mydata2)

Create mydata4, which includes only students aged between 20 and 25 (inclusive).

# Keep only the rows whose Age lies in the closed interval [20, 25].
mydata4 <- mydata3[with(mydata3, Age >= 20 & Age <= 25), ]

Create mydata5, which includes only females.

# Keep only the rows labelled "F" in the GenderF factor.
mydata5 <- mydata3[with(mydata3, GenderF == "F"), ]

Compute descriptive statistics by gender, using the describeBy function.

# describeBy() is used from psych; attaching the package again is harmless
# when it is already loaded.
library(psych)

# Descriptive statistics of the Verbal score, reported separately per gender.
describeBy(mydata3[["Verbal"]], group = mydata3[["GenderF"]])

## 
##  Descriptive statistics by group 
## group: F
##    vars   n   mean     sd median trimmed    mad min max range  skew
## X1    1 442 610.66 112.81    620  618.09 103.78 200 800   600 -0.66
##    kurtosis   se
## X1     0.43 5.37
## ---------------------------------------------------- 
## group: M
##    vars   n   mean     sd median trimmed    mad min max range  skew
## X1    1 245 615.36 114.33    630  622.44 118.61 200 800   600 -0.63
##    kurtosis  se
## X1     0.14 7.3

Example 1

2023-09-18