Check the dimensions of the dataset

# Handle missing values
# Count NA cells across every row and column of the raw data frame `ss`
sum(is.na(ss))

## [1] 1

# Drop every row that contains at least one NA and keep the result as a
# separate cleaned data frame; `ss` itself is not modified.
# na.omit() records which rows it dropped in the `na.action` attribute.
ss_clean <- na.omit(ss)
# Confirm no missing values remain in the cleaned copy (expect 0)
sum(is.na(ss_clean))

## [1] 0

# Check the data structure: column names, types and the first few values
str(ss_clean)

## 'data.frame':    78 obs. of  17 variables:
##  $ Year      : chr  "Senior" "Sophomore" "FirstYear" "Junior" ...
##  $ Sex       : chr  "M" "F" "M" "M" ...
##  $ Smoke     : chr  "No" "Yes" "No" "No" ...
##  $ Award     : chr  "Olympic" "Academy" "Nobel" "Nobel" ...
##  $ HigherSAT : chr  "Math" "Math" "Math" "Math" ...
##  $ Exercise  : int  10 4 14 3 3 5 10 13 12 12 ...
##  $ TV        : int  1 7 5 1 3 4 10 8 1 6 ...
##  $ Height    : int  71 66 72 63 65 65 66 74 60 65 ...
##  $ Weight    : int  180 120 208 110 150 114 128 235 115 140 ...
##  $ Siblings  : int  4 2 2 1 1 2 1 1 7 1 ...
##  $ BirthOrder: int  4 2 1 1 1 2 1 1 8 2 ...
##  $ VerbalSAT : int  540 520 550 490 720 600 640 660 670 500 ...
##  $ MathSAT   : int  670 630 560 630 450 550 680 710 700 670 ...
##  $ SAT       : int  1210 1150 1110 1120 1170 1150 1320 1370 1370 1170 ...
##  $ GPA       : num  3.13 2.5 2.55 3.1 2.7 3.2 2.77 3.3 3.7 2.09 ...
##  $ Pulse     : int  54 66 130 78 40 80 94 77 94 63 ...
##  $ Piercings : int  0 3 0 0 6 4 8 0 2 2 ...
##  - attr(*, "na.action")= 'omit' Named int 36
##   ..- attr(*, "names")= chr "36"

Summary statistics

# Min, quartiles, median, mean and max of the VerbalSAT column
summary(ss_clean[["VerbalSAT"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   420.0   550.0   585.0   583.3   630.0   720.0

# Mean GPA over all rows of the cleaned data
mean(ss_clean[["GPA"]])

## [1] 3.178974

# Build a two-column data frame holding only Weight and Exercise
df_small <- ss_clean[, c("Weight", "Exercise")]
# Fourth value of the first column (Weight)
df_small[[1]][4]

## [1] 110

# Percentage of students in the cleaned data whose Sex is "F"
# (count of matching rows divided by the total row count, times 100)
with(ss_clean, 100 * (sum(Sex == "F") / length(Sex)))

## [1] 50

# Mean of the TV column; used as the cut-off between the two groups below
avg_tv <- mean(ss_clean[["TV"]]) # 5.230769
# Students who watch more TV than average
tv_more <- ss_clean[ss_clean[["TV"]] > avg_tv, ]
# Students who watch the average amount of TV or less
tv_less <- ss_clean[ss_clean[["TV"]] <= avg_tv, ]
# Mean GPA of the above-average TV group
mean(tv_more[["GPA"]])

## [1] 3.167241

# Mean GPA of the at-or-below-average TV group
mean(tv_less[["GPA"]])

## [1] 3.185918

# Year, Sex and GPA of every student whose GPA is above 3.5
# (the data has no name column, so these fields describe the matching rows)
ss_clean[ss_clean[["GPA"]] > 3.5, c("Year", "Sex", "GPA")]

##         Year Sex  GPA
## 9  FirstYear   F 3.70
## 12 FirstYear   F 3.86
## 22 Sophomore   M 3.70
## 28 Sophomore   F 3.79
## 33 FirstYear   M 3.51
## 35 Sophomore   F 3.85
## 47    Senior   F 3.84
## 48    Junior   M 3.60
## 53 Sophomore   F 3.70
## 55 FirstYear   F 3.60
## 56 Sophomore   M 3.56
## 57    Senior   M 3.70
## 58 FirstYear   F 4.00
## 71 Sophomore   M 3.60
## 72    Senior   F 3.70

# Does the value 72 occur anywhere in the TV column?
72 %in% ss_clean[["TV"]]

## [1] FALSE

# Distinct even values in the Exercise column, in order of first appearance
# (unique() removes repeats, so each even value is listed once)
with(ss_clean, unique(Exercise[Exercise %% 2 == 0]))

##  [1] 10  4 14 12  6  2 20  8  0 24

Working with Data Frames Homework Session 2 Answers

A. Hairumian

Set the working directory and read the data

Check the dimensions of the dataset