Exercises in Data Frames


The data set is a survey of 4700 students who commute to college.

Attributes include
  1. the distance they commute
  2. their high school GPA
  3. if father or mother is a college graduate


The results indicate some correlation between parents education and shorter distances, as well as increased high school GPA.
The final figures display the outliers of very short and very long distances.




# CollegeDistance.df  <- read.csv("C:\\Users\\arono\\CUNY\\R\\CollegeDistance.csv")



# note the url puts "main" between my folders
url<-"https://raw.githubusercontent.com/TheReallyBigApple/CunyAssignments/main/R/CollegeDistance.csv"
library(httr)
library(RCurl)
x <- getURL(url)

CollegeDistance.df  <- read.csv(text=x, header=T)


  1. Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.


library(knitr)

# how do i format this nicely ?

knitr::kable(summary(CollegeDistance.df, caption='Summary of all Students'))          # score mean = 50.89
X gender ethnicity score fcollege mcollege home urban unemp wage distance tuition education income region
Min. : 1 Length:4739 Length:4739 Min. :28.95 Length:4739 Length:4739 Length:4739 Length:4739 Min. : 1.400 Min. : 6.590 Min. : 0.000 Min. :0.2575 Min. :12.00 Length:4739 Length:4739
1st Qu.: 1186 Class :character Class :character 1st Qu.:43.92 Class :character Class :character Class :character Class :character 1st Qu.: 5.900 1st Qu.: 8.850 1st Qu.: 0.400 1st Qu.:0.4850 1st Qu.:12.00 Class :character Class :character
Median : 2370 Mode :character Mode :character Median :51.19 Mode :character Mode :character Mode :character Mode :character Median : 7.100 Median : 9.680 Median : 1.000 Median :0.8245 Median :13.00 Mode :character Mode :character
Mean : 3955 NA NA Mean :50.89 NA NA NA NA Mean : 7.597 Mean : 9.501 Mean : 1.803 Mean :0.8146 Mean :13.81 NA NA
3rd Qu.: 3554 NA NA 3rd Qu.:57.77 NA NA NA NA 3rd Qu.: 8.900 3rd Qu.:10.150 3rd Qu.: 2.500 3rd Qu.:1.1270 3rd Qu.:16.00 NA NA
Max. :37810 NA NA Max. :72.81 NA NA NA NA Max. :24.900 Max. :12.960 Max. :20.000 Max. :1.4042 Max. :18.00 NA NA
score_mean<-round(mean(CollegeDistance.df$score),2) 
score_median<-round(median(CollegeDistance.df$score),2)

distance_mean<-round(mean(CollegeDistance.df$distance),2)
distance_median<-round(median(CollegeDistance.df$distance),2)

matrix1<-matrix(c(score_mean, score_median, distance_mean, distance_median), ncol=2, byrow=TRUE)


rownames(matrix1)=c("Score","Distance")
colnames(matrix1)=c("Mean","Median")

# kable(matrix1,"html")
knitr::kable(matrix1, caption='College Scores and Distances')
College Scores and Distances
Mean Median
Score 50.89 51.19
Distance 1.80 1.00


  1. Create a new data frame with a subset of the columns and rows. Make sure to rename it.
# select the gender, ethnicity, score and distance  just the records where the school is in an urban area and both parents went to college
UrbanCollegeEducatedDistance.df <- subset(CollegeDistance.df[c("X","gender", "ethnicity", "score","distance","urban")], CollegeDistance.df$urban == "yes" & 
                                    CollegeDistance.df$fcollege=="yes" & CollegeDistance.df$mcollege=="yes")


  1. Create new column names for the new data frame.


# rename the columns to improve their clarity. Note dplyr rename() is neater but, base R is used here

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "X"] <- "u_sort"

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "score"] <- "u_avg_score_as_HS_Senior"

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "gender"] <- "u_gender"

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "ethnicity"] <- "u_ethnicity"

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "distance"] <- "u_distance"


  1. Use the summary function to create an overview of your new data frame. The print the mean and median for the same two attributes. Please compare.


knitr::kable(summary(UrbanCollegeEducatedDistance.df), caption='College Scores and Distances Subset')
College Scores and Distances Subset
u_sort u_gender u_ethnicity u_avg_score_as_HS_Senior u_distance urban
Min. : 31 Length:77 Length:77 Min. :37.99 Min. :0.0000 Length:77
1st Qu.: 1635 Class :character Class :character 1st Qu.:50.96 1st Qu.:0.3000 Class :character
Median : 2506 Mode :character Mode :character Median :58.82 Median :0.5000 Mode :character
Mean : 5569 NA NA Mean :56.25 Mean :0.6727 NA
3rd Qu.: 3659 NA NA 3rd Qu.:62.44 3rd Qu.:0.8000 NA
Max. :26310 NA NA Max. :71.36 Max. :3.2000 NA
# summary(UrbanCollegeEducatedDistance.df)


score_mean<-round(mean(UrbanCollegeEducatedDistance.df$u_avg_score_as_HS_Senior),2) 
score_median<-round(median(UrbanCollegeEducatedDistance.df$u_avg_score_as_HS_Senior),2)

distance_mean<-round(mean(UrbanCollegeEducatedDistance.df$u_distance),2)
distance_median<-round(median(UrbanCollegeEducatedDistance.df$u_distance),2)

matrix1<-matrix(c(score_mean, score_median, distance_mean, distance_median), ncol=2, byrow=TRUE)

rownames(matrix1)=c("Score","Distance")
colnames(matrix1)=c("Mean","Median")


knitr::kable(matrix1, caption='College Scores and Distances')
College Scores and Distances
Mean Median
Score 56.25 58.82
Distance 0.67 0.50


  1. For at least 3 values in a column please rename so that every value in that column is updated.


# update the sort to 0, so the outliers have u_sort=0

q1 <- quantile(UrbanCollegeEducatedDistance.df$u_distance, .10 )
q3 <- quantile(UrbanCollegeEducatedDistance.df$u_distance, .90 )

sprintf("Isolating students who commuted less than %.2f or greater than %.2f", q1, q3)
## [1] "Isolating students who commuted less than 0.16 or greater than 1.50"
UrbanCollegeEducatedDistance.df <- within(UrbanCollegeEducatedDistance.df, {
  f <- u_distance < q1 | u_distance > q3
  u_sort[f] <- 0
}) 





# update other to unknown

UrbanCollegeEducatedDistance.df <- within(UrbanCollegeEducatedDistance.df, {
  f <- u_ethnicity == "other"
  u_ethnicity[f] <- "unknown"
}) 


  1. Display enough rows to see examples of all of steps 1-5 above.


# spot check 10 rows in which the sort=0 , these are the outliers
u_outliers<- head(subset(UrbanCollegeEducatedDistance.df,UrbanCollegeEducatedDistance.df$u_sort==0),10)



knitr::kable(u_outliers, caption='College Distance Outliers - Urban Location/Educated Family.')
College Distance Outliers - Urban Location/Educated Family.
u_sort u_gender u_ethnicity u_avg_score_as_HS_Senior u_distance urban f
175 0 male unknown 67.18 3.2 yes TRUE
196 0 female afam 45.36 0.1 yes FALSE
202 0 male afam 51.33 0.0 yes FALSE
1125 0 male unknown 46.77 0.1 yes TRUE
1680 0 male unknown 51.53 0.1 yes TRUE
1682 0 male unknown 65.17 0.1 yes TRUE
1989 0 male unknown 63.21 2.0 yes TRUE
1992 0 male unknown 66.45 2.0 yes TRUE
2040 0 male unknown 62.87 0.1 yes TRUE
2527 0 female unknown 59.76 1.8 yes TRUE
# spot check 10 rows in which the sort!=0 , these in the first 2 standard deviations ( more or less )
u_norms<-head(subset(UrbanCollegeEducatedDistance.df,UrbanCollegeEducatedDistance.df$u_sort!=0),10)
knitr::kable(u_norms, caption='College Distance Norms - Urban Location/Educated Family.')
College Distance Norms - Urban Location/Educated Family.
u_sort u_gender u_ethnicity u_avg_score_as_HS_Senior u_distance urban f
31 31 female unknown 68.58 0.5 yes TRUE
188 188 female unknown 65.93 1.5 yes TRUE
210 210 male unknown 62.36 1.5 yes TRUE
328 328 male unknown 62.44 0.7 yes TRUE
554 554 female unknown 53.23 0.5 yes TRUE
1044 1044 male afam 38.53 0.5 yes FALSE
1074 1074 male afam 58.50 0.7 yes FALSE
1097 1097 female afam 37.99 1.0 yes FALSE
1164 1164 male unknown 62.04 0.8 yes TRUE
1167 1167 female unknown 68.81 0.8 yes TRUE