Exercises in Data Frames

The data set is a survey of 4700 students who commute to college.

Attributes include

the distance they commute
their high school GPA
if father or mother is a college graduate

The results indicate some correlation between parents education and shorter distances, as well as increased high school GPA.
The final figures display the outliers of very short and very long distances.

# CollegeDistance.df  <- read.csv("C:\\Users\\arono\\CUNY\\R\\CollegeDistance.csv")



# note the url puts "main" between my folders
url<-"https://raw.githubusercontent.com/TheReallyBigApple/CunyAssignments/main/R/CollegeDistance.csv"
library(httr)
library(RCurl)
x <- getURL(url)

CollegeDistance.df  <- read.csv(text=x, header=T)

Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.

library(knitr)

# how do i format this nicely ?

knitr::kable(summary(CollegeDistance.df, caption='Summary of all Students'))          # score mean = 50.89

X	gender	ethnicity	score	fcollege	mcollege	home	urban	unemp	wage	distance	tuition	education	income	region
Min. : 1	Length:4739	Length:4739	Min. :28.95	Length:4739	Length:4739	Length:4739	Length:4739	Min. : 1.400	Min. : 6.590	Min. : 0.000	Min. :0.2575	Min. :12.00	Length:4739	Length:4739
1st Qu.: 1186	Class :character	Class :character	1st Qu.:43.92	Class :character	Class :character	Class :character	Class :character	1st Qu.: 5.900	1st Qu.: 8.850	1st Qu.: 0.400	1st Qu.:0.4850	1st Qu.:12.00	Class :character	Class :character
Median : 2370	Mode :character	Mode :character	Median :51.19	Mode :character	Mode :character	Mode :character	Mode :character	Median : 7.100	Median : 9.680	Median : 1.000	Median :0.8245	Median :13.00	Mode :character	Mode :character
Mean : 3955	NA	NA	Mean :50.89	NA	NA	NA	NA	Mean : 7.597	Mean : 9.501	Mean : 1.803	Mean :0.8146	Mean :13.81	NA	NA
3rd Qu.: 3554	NA	NA	3rd Qu.:57.77	NA	NA	NA	NA	3rd Qu.: 8.900	3rd Qu.:10.150	3rd Qu.: 2.500	3rd Qu.:1.1270	3rd Qu.:16.00	NA	NA
Max. :37810	NA	NA	Max. :72.81	NA	NA	NA	NA	Max. :24.900	Max. :12.960	Max. :20.000	Max. :1.4042	Max. :18.00	NA	NA

score_mean<-round(mean(CollegeDistance.df$score),2) 
score_median<-round(median(CollegeDistance.df$score),2)

distance_mean<-round(mean(CollegeDistance.df$distance),2)
distance_median<-round(median(CollegeDistance.df$distance),2)

matrix1<-matrix(c(score_mean, score_median, distance_mean, distance_median), ncol=2, byrow=TRUE)


rownames(matrix1)=c("Score","Distance")
colnames(matrix1)=c("Mean","Median")

# kable(matrix1,"html")
knitr::kable(matrix1, caption='College Scores and Distances')

College Scores and Distances
	Mean	Median
Score	50.89	51.19
Distance	1.80	1.00

Create a new data frame with a subset of the columns and rows. Make sure to rename it.

# select the gender, ethnicity, score and distance  just the records where the school is in an urban area and both parents went to college
UrbanCollegeEducatedDistance.df <- subset(CollegeDistance.df[c("X","gender", "ethnicity", "score","distance","urban")], CollegeDistance.df$urban == "yes" & 
                                    CollegeDistance.df$fcollege=="yes" & CollegeDistance.df$mcollege=="yes")

Create new column names for the new data frame.

# rename the columns to improve their clarity. Note dplyr rename() is neater but, base R is used here

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "X"] <- "u_sort"

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "score"] <- "u_avg_score_as_HS_Senior"

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "gender"] <- "u_gender"

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "ethnicity"] <- "u_ethnicity"

names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "distance"] <- "u_distance"

Use the summary function to create an overview of your new data frame. The print the mean and median for the same two attributes. Please compare.

knitr::kable(summary(UrbanCollegeEducatedDistance.df), caption='College Scores and Distances Subset')

College Scores and Distances Subset
u_sort	u_gender	u_ethnicity	u_avg_score_as_HS_Senior	u_distance	urban
Min. : 31	Length:77	Length:77	Min. :37.99	Min. :0.0000	Length:77
1st Qu.: 1635	Class :character	Class :character	1st Qu.:50.96	1st Qu.:0.3000	Class :character
Median : 2506	Mode :character	Mode :character	Median :58.82	Median :0.5000	Mode :character
Mean : 5569	NA	NA	Mean :56.25	Mean :0.6727	NA
3rd Qu.: 3659	NA	NA	3rd Qu.:62.44	3rd Qu.:0.8000	NA
Max. :26310	NA	NA	Max. :71.36	Max. :3.2000	NA

# summary(UrbanCollegeEducatedDistance.df)


score_mean<-round(mean(UrbanCollegeEducatedDistance.df$u_avg_score_as_HS_Senior),2) 
score_median<-round(median(UrbanCollegeEducatedDistance.df$u_avg_score_as_HS_Senior),2)

distance_mean<-round(mean(UrbanCollegeEducatedDistance.df$u_distance),2)
distance_median<-round(median(UrbanCollegeEducatedDistance.df$u_distance),2)

matrix1<-matrix(c(score_mean, score_median, distance_mean, distance_median), ncol=2, byrow=TRUE)

rownames(matrix1)=c("Score","Distance")
colnames(matrix1)=c("Mean","Median")


knitr::kable(matrix1, caption='College Scores and Distances')

College Scores and Distances
	Mean	Median
Score	56.25	58.82
Distance	0.67	0.50

For at least 3 values in a column please rename so that every value in that column is updated.

# update the sort to 0, so the outliers have u_sort=0

q1 <- quantile(UrbanCollegeEducatedDistance.df$u_distance, .10 )
q3 <- quantile(UrbanCollegeEducatedDistance.df$u_distance, .90 )

sprintf("Isolating students who commuted less than %.2f or greater than %.2f", q1, q3)

## [1] "Isolating students who commuted less than 0.16 or greater than 1.50"

UrbanCollegeEducatedDistance.df <- within(UrbanCollegeEducatedDistance.df, {
  f <- u_distance < q1 | u_distance > q3
  u_sort[f] <- 0
}) 





# update other to unknown

UrbanCollegeEducatedDistance.df <- within(UrbanCollegeEducatedDistance.df, {
  f <- u_ethnicity == "other"
  u_ethnicity[f] <- "unknown"
})

Display enough rows to see examples of all of steps 1-5 above.

# spot check 10 rows in which the sort=0 , these are the outliers
u_outliers<- head(subset(UrbanCollegeEducatedDistance.df,UrbanCollegeEducatedDistance.df$u_sort==0),10)



knitr::kable(u_outliers, caption='College Distance Outliers - Urban Location/Educated Family.')

College Distance Outliers - Urban Location/Educated Family.
	u_gender	u_ethnicity	u_avg_score_as_HS_Senior	u_distance	urban	f
175	male	unknown	67.18	3.2	yes	TRUE
196	female	afam	45.36	0.1	yes	FALSE
202	male	afam	51.33	0.0	yes	FALSE
1125	male	unknown	46.77	0.1	yes	TRUE
1680	male	unknown	51.53	0.1	yes	TRUE
1682	male	unknown	65.17	0.1	yes	TRUE
1989	male	unknown	63.21	2.0	yes	TRUE
1992	male	unknown	66.45	2.0	yes	TRUE
2040	male	unknown	62.87	0.1	yes	TRUE
2527	female	unknown	59.76	1.8	yes	TRUE

# spot check 10 rows in which the sort!=0 , these in the first 2 standard deviations ( more or less )
u_norms<-head(subset(UrbanCollegeEducatedDistance.df,UrbanCollegeEducatedDistance.df$u_sort!=0),10)
knitr::kable(u_norms, caption='College Distance Norms - Urban Location/Educated Family.')

College Distance Norms - Urban Location/Educated Family.
	u_sort	u_gender	u_ethnicity	u_avg_score_as_HS_Senior	u_distance	urban	f
31	31	female	unknown	68.58	0.5	yes	TRUE
188	188	female	unknown	65.93	1.5	yes	TRUE
210	210	male	unknown	62.36	1.5	yes	TRUE
328	328	male	unknown	62.44	0.7	yes	TRUE
554	554	female	unknown	53.23	0.5	yes	TRUE
1044	1044	male	afam	38.53	0.5	yes	FALSE
1074	1074	male	afam	58.50	0.7	yes	FALSE
1097	1097	female	afam	37.99	1.0	yes	FALSE
1164	1164	male	unknown	62.04	0.8	yes	TRUE
1167	1167	female	unknown	68.81	0.8	yes	TRUE

CUNY MSDS R Week 2 Assignment

Tom Buonora

January 9, 2021

Exercises in Data Frames