# Load the student survey data ----
# Option 1: read the CSV from a path relative to the working directory.
# stringsAsFactors = FALSE keeps text columns as character vectors on R < 4.0
# (it is already the default from R 4.0 onward).
ss <- read.csv("StudentSurvey(1).csv", stringsAsFactors = FALSE)
# Option 2: interactive file selection (uncomment if needed).
# file.choose() opens a file picker and returns the chosen path.
# ss <- read.csv(file.choose(), stringsAsFactors = FALSE)
# Dimensions of the raw data: number of rows, then number of columns
dim(ss)
## [1] 79 17
# Missing-value handling ----
# Total number of NA cells across the whole raw data frame
length(which(is.na(ss)))
## [1] 1
# Keep only complete rows; the result is stored as a separate cleaned copy
# so the raw data in `ss` stays untouched
ss_clean <- stats::na.omit(ss)
# Sanity check: the cleaned copy should contain no NA cells
length(which(is.na(ss_clean)))
## [1] 0
# Inspect column types and the first few values of the cleaned data
utils::str(ss_clean)
## 'data.frame': 78 obs. of 17 variables:
## $ Year : chr "Senior" "Sophomore" "FirstYear" "Junior" ...
## $ Sex : chr "M" "F" "M" "M" ...
## $ Smoke : chr "No" "Yes" "No" "No" ...
## $ Award : chr "Olympic" "Academy" "Nobel" "Nobel" ...
## $ HigherSAT : chr "Math" "Math" "Math" "Math" ...
## $ Exercise : int 10 4 14 3 3 5 10 13 12 12 ...
## $ TV : int 1 7 5 1 3 4 10 8 1 6 ...
## $ Height : int 71 66 72 63 65 65 66 74 60 65 ...
## $ Weight : int 180 120 208 110 150 114 128 235 115 140 ...
## $ Siblings : int 4 2 2 1 1 2 1 1 7 1 ...
## $ BirthOrder: int 4 2 1 1 1 2 1 1 8 2 ...
## $ VerbalSAT : int 540 520 550 490 720 600 640 660 670 500 ...
## $ MathSAT : int 670 630 560 630 450 550 680 710 700 670 ...
## $ SAT : int 1210 1150 1110 1120 1170 1150 1320 1370 1370 1170 ...
## $ GPA : num 3.13 2.5 2.55 3.1 2.7 3.2 2.77 3.3 3.7 2.09 ...
## $ Pulse : int 54 66 130 78 40 80 94 77 94 63 ...
## $ Piercings : int 0 3 0 0 6 4 8 0 2 2 ...
## - attr(*, "na.action")= 'omit' Named int 36
## ..- attr(*, "names")= chr "36"
# Summary statistics ----
# Min, quartiles, median, mean and max of VerbalSAT
summary(ss_clean$VerbalSAT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 420.0 550.0 585.0 583.3 630.0 720.0
# Average GPA over the cleaned data
mean(ss_clean$GPA)
## [1] 3.178974
# Build a two-column data frame holding only Weight and Exercise
df_small <- ss_clean[c("Weight", "Exercise")]
# Fourth value of the first column (Weight)
df_small[[1]][4]
## [1] 110
# Percentage of students in the cleaned data whose Sex is "F"
with(ss_clean, sum(Sex == "F") / length(Sex)) * 100
## [1] 50
# Average TV value, used as the cut-off between the two groups below
avg_tv <- mean(ss_clean[["TV"]]) # 5.230769
# Students at or below the average TV value
tv_less <- ss_clean[ss_clean[["TV"]] <= avg_tv, ]
# Students strictly above the average TV value
tv_more <- ss_clean[ss_clean[["TV"]] > avg_tv, ]
# Mean GPA of the above-average TV group
mean(tv_more[["GPA"]])
## [1] 3.167241
# Mean GPA of the at-or-below-average TV group
mean(tv_less[["GPA"]])
## [1] 3.185918
# Show Year, Sex and GPA for every student whose GPA is above 3.5
subset(ss_clean, GPA > 3.5, select = c(Year, Sex, GPA))
## Year Sex GPA
## 9 FirstYear F 3.70
## 12 FirstYear F 3.86
## 22 Sophomore M 3.70
## 28 Sophomore F 3.79
## 33 FirstYear M 3.51
## 35 Sophomore F 3.85
## 47 Senior F 3.84
## 48 Junior M 3.60
## 53 Sophomore F 3.70
## 55 FirstYear F 3.60
## 56 Sophomore M 3.56
## 57 Senior M 3.70
## 58 FirstYear F 4.00
## 71 Sophomore M 3.60
## 72 Senior F 3.70
# Is the value 72 found anywhere in the TV column?
is.element(72, ss_clean[["TV"]])
## [1] FALSE
# Distinct even values found in the Exercise column
with(ss_clean, unique(Exercise[Exercise %% 2 == 0]))
## [1] 10 4 14 12 6 2 20 8 0 24