Exercises in Data Frames
The data set is a survey of 4700 students who commute to college.
Attributes include
-
the distance they commute
-
their high school GPA
-
if father or mother is a college graduate
The results indicate some correlation between parents education and shorter distances, as well as increased high school GPA.
The final figures display the outliers of very short and very long distances.
# CollegeDistance.df <- read.csv("C:\\Users\\arono\\CUNY\\R\\CollegeDistance.csv")
# note the url puts "main" between my folders
url<-"https://raw.githubusercontent.com/TheReallyBigApple/CunyAssignments/main/R/CollegeDistance.csv"
library(httr)
library(RCurl)
x <- getURL(url)
CollegeDistance.df <- read.csv(text=x, header=T)
- Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.
library(knitr)
# how do i format this nicely ?
knitr::kable(summary(CollegeDistance.df, caption='Summary of all Students')) # score mean = 50.89
|
Min. : 1 |
Length:4739 |
Length:4739 |
Min. :28.95 |
Length:4739 |
Length:4739 |
Length:4739 |
Length:4739 |
Min. : 1.400 |
Min. : 6.590 |
Min. : 0.000 |
Min. :0.2575 |
Min. :12.00 |
Length:4739 |
Length:4739 |
|
1st Qu.: 1186 |
Class :character |
Class :character |
1st Qu.:43.92 |
Class :character |
Class :character |
Class :character |
Class :character |
1st Qu.: 5.900 |
1st Qu.: 8.850 |
1st Qu.: 0.400 |
1st Qu.:0.4850 |
1st Qu.:12.00 |
Class :character |
Class :character |
|
Median : 2370 |
Mode :character |
Mode :character |
Median :51.19 |
Mode :character |
Mode :character |
Mode :character |
Mode :character |
Median : 7.100 |
Median : 9.680 |
Median : 1.000 |
Median :0.8245 |
Median :13.00 |
Mode :character |
Mode :character |
|
Mean : 3955 |
NA |
NA |
Mean :50.89 |
NA |
NA |
NA |
NA |
Mean : 7.597 |
Mean : 9.501 |
Mean : 1.803 |
Mean :0.8146 |
Mean :13.81 |
NA |
NA |
|
3rd Qu.: 3554 |
NA |
NA |
3rd Qu.:57.77 |
NA |
NA |
NA |
NA |
3rd Qu.: 8.900 |
3rd Qu.:10.150 |
3rd Qu.: 2.500 |
3rd Qu.:1.1270 |
3rd Qu.:16.00 |
NA |
NA |
|
Max. :37810 |
NA |
NA |
Max. :72.81 |
NA |
NA |
NA |
NA |
Max. :24.900 |
Max. :12.960 |
Max. :20.000 |
Max. :1.4042 |
Max. :18.00 |
NA |
NA |
score_mean<-round(mean(CollegeDistance.df$score),2)
score_median<-round(median(CollegeDistance.df$score),2)
distance_mean<-round(mean(CollegeDistance.df$distance),2)
distance_median<-round(median(CollegeDistance.df$distance),2)
matrix1<-matrix(c(score_mean, score_median, distance_mean, distance_median), ncol=2, byrow=TRUE)
rownames(matrix1)=c("Score","Distance")
colnames(matrix1)=c("Mean","Median")
# kable(matrix1,"html")
knitr::kable(matrix1, caption='College Scores and Distances')
College Scores and Distances
| Score |
50.89 |
51.19 |
| Distance |
1.80 |
1.00 |
- Create a new data frame with a subset of the columns and rows. Make sure to rename it.
# select the gender, ethnicity, score and distance just the records where the school is in an urban area and both parents went to college
UrbanCollegeEducatedDistance.df <- subset(CollegeDistance.df[c("X","gender", "ethnicity", "score","distance","urban")], CollegeDistance.df$urban == "yes" &
CollegeDistance.df$fcollege=="yes" & CollegeDistance.df$mcollege=="yes")
- Create new column names for the new data frame.
# rename the columns to improve their clarity. Note dplyr rename() is neater but, base R is used here
names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "X"] <- "u_sort"
names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "score"] <- "u_avg_score_as_HS_Senior"
names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "gender"] <- "u_gender"
names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "ethnicity"] <- "u_ethnicity"
names(UrbanCollegeEducatedDistance.df)[names(UrbanCollegeEducatedDistance.df) == "distance"] <- "u_distance"
- Use the summary function to create an overview of your new data frame. The print the mean and median for the same two attributes. Please compare.
knitr::kable(summary(UrbanCollegeEducatedDistance.df), caption='College Scores and Distances Subset')
College Scores and Distances Subset
|
Min. : 31 |
Length:77 |
Length:77 |
Min. :37.99 |
Min. :0.0000 |
Length:77 |
|
1st Qu.: 1635 |
Class :character |
Class :character |
1st Qu.:50.96 |
1st Qu.:0.3000 |
Class :character |
|
Median : 2506 |
Mode :character |
Mode :character |
Median :58.82 |
Median :0.5000 |
Mode :character |
|
Mean : 5569 |
NA |
NA |
Mean :56.25 |
Mean :0.6727 |
NA |
|
3rd Qu.: 3659 |
NA |
NA |
3rd Qu.:62.44 |
3rd Qu.:0.8000 |
NA |
|
Max. :26310 |
NA |
NA |
Max. :71.36 |
Max. :3.2000 |
NA |
# summary(UrbanCollegeEducatedDistance.df)
score_mean<-round(mean(UrbanCollegeEducatedDistance.df$u_avg_score_as_HS_Senior),2)
score_median<-round(median(UrbanCollegeEducatedDistance.df$u_avg_score_as_HS_Senior),2)
distance_mean<-round(mean(UrbanCollegeEducatedDistance.df$u_distance),2)
distance_median<-round(median(UrbanCollegeEducatedDistance.df$u_distance),2)
matrix1<-matrix(c(score_mean, score_median, distance_mean, distance_median), ncol=2, byrow=TRUE)
rownames(matrix1)=c("Score","Distance")
colnames(matrix1)=c("Mean","Median")
knitr::kable(matrix1, caption='College Scores and Distances')
College Scores and Distances
| Score |
56.25 |
58.82 |
| Distance |
0.67 |
0.50 |
- For at least 3 values in a column please rename so that every value in that column is updated.
# update the sort to 0, so the outliers have u_sort=0
q1 <- quantile(UrbanCollegeEducatedDistance.df$u_distance, .10 )
q3 <- quantile(UrbanCollegeEducatedDistance.df$u_distance, .90 )
sprintf("Isolating students who commuted less than %.2f or greater than %.2f", q1, q3)
## [1] "Isolating students who commuted less than 0.16 or greater than 1.50"
UrbanCollegeEducatedDistance.df <- within(UrbanCollegeEducatedDistance.df, {
f <- u_distance < q1 | u_distance > q3
u_sort[f] <- 0
})
# update other to unknown
UrbanCollegeEducatedDistance.df <- within(UrbanCollegeEducatedDistance.df, {
f <- u_ethnicity == "other"
u_ethnicity[f] <- "unknown"
})
- Display enough rows to see examples of all of steps 1-5 above.
# spot check 10 rows in which the sort=0 , these are the outliers
u_outliers<- head(subset(UrbanCollegeEducatedDistance.df,UrbanCollegeEducatedDistance.df$u_sort==0),10)
knitr::kable(u_outliers, caption='College Distance Outliers - Urban Location/Educated Family.')
College Distance Outliers - Urban Location/Educated Family.
| 175 |
0 |
male |
unknown |
67.18 |
3.2 |
yes |
TRUE |
| 196 |
0 |
female |
afam |
45.36 |
0.1 |
yes |
FALSE |
| 202 |
0 |
male |
afam |
51.33 |
0.0 |
yes |
FALSE |
| 1125 |
0 |
male |
unknown |
46.77 |
0.1 |
yes |
TRUE |
| 1680 |
0 |
male |
unknown |
51.53 |
0.1 |
yes |
TRUE |
| 1682 |
0 |
male |
unknown |
65.17 |
0.1 |
yes |
TRUE |
| 1989 |
0 |
male |
unknown |
63.21 |
2.0 |
yes |
TRUE |
| 1992 |
0 |
male |
unknown |
66.45 |
2.0 |
yes |
TRUE |
| 2040 |
0 |
male |
unknown |
62.87 |
0.1 |
yes |
TRUE |
| 2527 |
0 |
female |
unknown |
59.76 |
1.8 |
yes |
TRUE |
# spot check 10 rows in which the sort!=0 , these in the first 2 standard deviations ( more or less )
u_norms<-head(subset(UrbanCollegeEducatedDistance.df,UrbanCollegeEducatedDistance.df$u_sort!=0),10)
knitr::kable(u_norms, caption='College Distance Norms - Urban Location/Educated Family.')
College Distance Norms - Urban Location/Educated Family.
| 31 |
31 |
female |
unknown |
68.58 |
0.5 |
yes |
TRUE |
| 188 |
188 |
female |
unknown |
65.93 |
1.5 |
yes |
TRUE |
| 210 |
210 |
male |
unknown |
62.36 |
1.5 |
yes |
TRUE |
| 328 |
328 |
male |
unknown |
62.44 |
0.7 |
yes |
TRUE |
| 554 |
554 |
female |
unknown |
53.23 |
0.5 |
yes |
TRUE |
| 1044 |
1044 |
male |
afam |
38.53 |
0.5 |
yes |
FALSE |
| 1074 |
1074 |
male |
afam |
58.50 |
0.7 |
yes |
FALSE |
| 1097 |
1097 |
female |
afam |
37.99 |
1.0 |
yes |
FALSE |
| 1164 |
1164 |
male |
unknown |
62.04 |
0.8 |
yes |
TRUE |
| 1167 |
1167 |
female |
unknown |
68.81 |
0.8 |
yes |
TRUE |