R HW 2

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#Question 1

# Read the comma-separated CollegeDistance file (first row holds the column
# names) and print a per-column summary.
data <- read.table(
  file = "C:/Users/Carlisle Ferguson/Downloads/CollegeDistance.csv",
  sep = ",",
  header = TRUE
)
summary(data)

##        X            gender           ethnicity             score      
##  Min.   :    1   Length:4739        Length:4739        Min.   :28.95  
##  1st Qu.: 1186   Class :character   Class :character   1st Qu.:43.92  
##  Median : 2370   Mode  :character   Mode  :character   Median :51.19  
##  Mean   : 3955                                         Mean   :50.89  
##  3rd Qu.: 3554                                         3rd Qu.:57.77  
##  Max.   :37810                                         Max.   :72.81  
##    fcollege           mcollege             home              urban          
##  Length:4739        Length:4739        Length:4739        Length:4739       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      unemp             wage           distance         tuition      
##  Min.   : 1.400   Min.   : 6.590   Min.   : 0.000   Min.   :0.2575  
##  1st Qu.: 5.900   1st Qu.: 8.850   1st Qu.: 0.400   1st Qu.:0.4850  
##  Median : 7.100   Median : 9.680   Median : 1.000   Median :0.8245  
##  Mean   : 7.597   Mean   : 9.501   Mean   : 1.803   Mean   :0.8146  
##  3rd Qu.: 8.900   3rd Qu.:10.150   3rd Qu.: 2.500   3rd Qu.:1.1270  
##  Max.   :24.900   Max.   :12.960   Max.   :20.000   Max.   :1.4042  
##    education        income             region         
##  Min.   :12.00   Length:4739        Length:4739       
##  1st Qu.:12.00   Class :character   Class :character  
##  Median :13.00   Mode  :character   Mode  :character  
##  Mean   :13.81                                        
##  3rd Qu.:16.00                                        
##  Max.   :18.00

# Mean and median of score and distance over the full data set.
mean(data[["score"]])

## [1] 50.88903

median(data[["score"]])

## [1] 51.19

mean(data[["distance"]])

## [1] 1.80287

median(data[["distance"]])

## [1] 1

#Questions 2-4: After subsetting the data to test scores above 50, both the mean and the median test score were higher than in the full data set; the mean distance was lower (1.67 vs. 1.80) and the median distance was unchanged at 1.

# Keep a separate copy of the full data for the later recoding step.
df <- data.frame(data)

# Rows with score above 50, restricted to gender, score and distance.
sub <- subset(
  data,
  subset = score > 50,
  select = c(gender, score, distance)
)

# The selected columns come back in the order requested above, so all three
# can be renamed in a single positional assignment.
names(sub) <- c("pref_gender", "test_score", "distance_from")

summary(sub)

##  pref_gender          test_score    distance_from   
##  Length:2554        Min.   :50.01   Min.   : 0.000  
##  Class :character   1st Qu.:53.72   1st Qu.: 0.400  
##  Mode  :character   Median :57.25   Median : 1.000  
##                     Mean   :57.67   Mean   : 1.667  
##                     3rd Qu.:61.16   3rd Qu.: 2.000  
##                     Max.   :72.81   Max.   :20.000

# Mean and median of the renamed score and distance columns in the subset.
mean(sub[["test_score"]])

## [1] 57.6727

median(sub[["test_score"]])

## [1] 57.25

mean(sub[["distance_from"]])

## [1] 1.666836

median(sub[["distance_from"]])

## [1] 1

#Questions 5-6

# Recode "female" to "f" in the gender column only. Comparing the whole data
# frame (df == "female") tests every cell of every column and would overwrite
# a matching value in any other column as well.
df$gender[df$gender == "female"] <- "f"
head(df)

##   X gender ethnicity score fcollege mcollege home urban unemp wage distance
## 1 1   male     other 39.15      yes       no  yes   yes   6.2 8.09      0.2
## 2 2      f     other 48.87       no       no  yes   yes   6.2 8.09      0.2
## 3 3   male     other 48.74       no       no  yes   yes   6.2 8.09      0.2
## 4 4   male      afam 40.40       no       no  yes   yes   6.2 8.09      0.2
## 5 5      f     other 40.48       no       no   no   yes   5.6 8.09      0.4
## 6 6   male     other 54.71       no       no  yes   yes   5.6 8.09      0.4
##   tuition education income region
## 1 0.88915        12   high  other
## 2 0.88915        12    low  other
## 3 0.88915        12    low  other
## 4 0.88915        12    low  other
## 5 0.88915        13    low  other
## 6 0.88915        12    low  other

#Bonus

# Load the same CSV directly from GitHub with readr.
library(readr)
urlfile <- "https://raw.githubusercontent.com/carlisleferguson/RBridgeHW2/main/CollegeDistance.csv"
# read_csv() downloads http(s) paths itself, so no explicit url() connection
# is needed.
github_data <- read_csv(urlfile)

## Warning: Missing column names filled in: 'X1' [1]

## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_double(),
##   gender = col_character(),
##   ethnicity = col_character(),
##   score = col_double(),
##   fcollege = col_character(),
##   mcollege = col_character(),
##   home = col_character(),
##   urban = col_character(),
##   unemp = col_double(),
##   wage = col_double(),
##   distance = col_double(),
##   tuition = col_double(),
##   education = col_double(),
##   income = col_character(),
##   region = col_character()
## )

# Preview the first six rows of the downloaded data.
head(github_data, n = 6L)

## # A tibble: 6 x 15
##      X1 gender ethnicity score fcollege mcollege home  urban unemp  wage
##   <dbl> <chr>  <chr>     <dbl> <chr>    <chr>    <chr> <chr> <dbl> <dbl>
## 1     1 male   other      39.2 yes      no       yes   yes    6.20  8.09
## 2     2 female other      48.9 no       no       yes   yes    6.20  8.09
## 3     3 male   other      48.7 no       no       yes   yes    6.20  8.09
## 4     4 male   afam       40.4 no       no       yes   yes    6.20  8.09
## 5     5 female other      40.5 no       no       no    yes    5.60  8.09
## 6     6 male   other      54.7 no       no       yes   yes    5.60  8.09
## # ... with 5 more variables: distance <dbl>, tuition <dbl>, education <dbl>,
## #   income <chr>, region <chr>

R HW 2

Carlisle Ferguson

1/9/2021

R Markdown