This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#Question 1
# Load the College Distance data set. read.csv() is used instead of
# read.table(sep = ",") because its defaults are meant for CSV input: only
# double quotes delimit fields and "#" is not treated as a comment character.
# NOTE(review): this absolute path is machine-specific -- confirm it exists
# before knitting on another computer.
data <- read.csv("C:/Users/Carlisle Ferguson/Downloads/CollegeDistance.csv", header = TRUE)
# Per-column summary of the imported data frame.
summary(data)
## X gender ethnicity score
## Min. : 1 Length:4739 Length:4739 Min. :28.95
## 1st Qu.: 1186 Class :character Class :character 1st Qu.:43.92
## Median : 2370 Mode :character Mode :character Median :51.19
## Mean : 3955 Mean :50.89
## 3rd Qu.: 3554 3rd Qu.:57.77
## Max. :37810 Max. :72.81
## fcollege mcollege home urban
## Length:4739 Length:4739 Length:4739 Length:4739
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## unemp wage distance tuition
## Min. : 1.400 Min. : 6.590 Min. : 0.000 Min. :0.2575
## 1st Qu.: 5.900 1st Qu.: 8.850 1st Qu.: 0.400 1st Qu.:0.4850
## Median : 7.100 Median : 9.680 Median : 1.000 Median :0.8245
## Mean : 7.597 Mean : 9.501 Mean : 1.803 Mean :0.8146
## 3rd Qu.: 8.900 3rd Qu.:10.150 3rd Qu.: 2.500 3rd Qu.:1.1270
## Max. :24.900 Max. :12.960 Max. :20.000 Max. :1.4042
## education income region
## Min. :12.00 Length:4739 Length:4739
## 1st Qu.:12.00 Class :character Class :character
## Median :13.00 Mode :character Mode :character
## Mean :13.81
## 3rd Qu.:16.00
## Max. :18.00
# Mean and median of the score column in the full data set.
mean(data[["score"]])
## [1] 50.88903
median(data[["score"]])
## [1] 51.19
# Mean and median of the distance column in the full data set.
mean(data[["distance"]])
## [1] 1.80287
median(data[["distance"]])
## [1] 1
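#Questions 2-4 After subsetting the data for test scores above 50, the mean and median test scores were both higher than in the full data set (57.67 vs. 50.89 and 57.25 vs. 51.19). The mean distance was lower (1.67 vs. 1.80), while the median distance was unchanged at 1.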
# `df` is a data frame built from the imported `data`.
df <- data.frame(data)
# Rows whose score exceeds 50, restricted to gender, score and distance.
# which() discards NA comparisons, matching how subset() treats them.
sub <- data[which(data$score > 50), c("gender", "score", "distance")]
# Rename the three retained columns, in the order they were selected.
names(sub) <- c("pref_gender", "test_score", "distance_from")
summary(sub)
## pref_gender test_score distance_from
## Length:2554 Min. :50.01 Min. : 0.000
## Class :character 1st Qu.:53.72 1st Qu.: 0.400
## Mode :character Median :57.25 Median : 1.000
## Mean :57.67 Mean : 1.667
## 3rd Qu.:61.16 3rd Qu.: 2.000
## Max. :72.81 Max. :20.000
# Mean and median of the test_score column in the subset.
mean(sub[["test_score"]])
## [1] 57.6727
median(sub[["test_score"]])
## [1] 57.25
# Mean and median of the distance_from column in the subset.
mean(sub[["distance_from"]])
## [1] 1.666836
median(sub[["distance_from"]])
## [1] 1
#Question 5 - 6
# Recode "female" as "f" in the gender column only. Comparing the whole data
# frame (df == 'female') tests every column and would also overwrite any
# matching value outside gender.
# %in% never yields NA, so missing gender values are left untouched.
df$gender[df$gender %in% "female"] <- "f"
head(df)
## X gender ethnicity score fcollege mcollege home urban unemp wage distance
## 1 1 male other 39.15 yes no yes yes 6.2 8.09 0.2
## 2 2 f other 48.87 no no yes yes 6.2 8.09 0.2
## 3 3 male other 48.74 no no yes yes 6.2 8.09 0.2
## 4 4 male afam 40.40 no no yes yes 6.2 8.09 0.2
## 5 5 f other 40.48 no no no yes 5.6 8.09 0.4
## 6 6 male other 54.71 no no yes yes 5.6 8.09 0.4
## tuition education income region
## 1 0.88915 12 high other
## 2 0.88915 12 low other
## 3 0.88915 12 low other
## 4 0.88915 12 low other
## 5 0.88915 13 low other
## 6 0.88915 12 low other
#Bonus
library(readr)
# Raw GitHub URL of CollegeDistance.csv.
urlfile <- "https://raw.githubusercontent.com/carlisleferguson/RBridgeHW2/main/CollegeDistance.csv"
# read_csv() downloads http(s) paths itself, so no explicit url() connection
# is needed.
github_data <- read_csv(urlfile)
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_double(),
## gender = col_character(),
## ethnicity = col_character(),
## score = col_double(),
## fcollege = col_character(),
## mcollege = col_character(),
## home = col_character(),
## urban = col_character(),
## unemp = col_double(),
## wage = col_double(),
## distance = col_double(),
## tuition = col_double(),
## education = col_double(),
## income = col_character(),
## region = col_character()
## )
# Show the first six rows of the tibble read from GitHub.
head(github_data, n = 6L)
## # A tibble: 6 x 15
## X1 gender ethnicity score fcollege mcollege home urban unemp wage
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 1 male other 39.2 yes no yes yes 6.20 8.09
## 2 2 female other 48.9 no no yes yes 6.20 8.09
## 3 3 male other 48.7 no no yes yes 6.20 8.09
## 4 4 male afam 40.4 no no yes yes 6.20 8.09
## 5 5 female other 40.5 no no no yes 5.60 8.09
## 6 6 male other 54.7 no no yes yes 5.60 8.09
## # ... with 5 more variables: distance <dbl>, tuition <dbl>, education <dbl>,
## # income <chr>, region <chr>