# R by Example
# Chapter 5 Exploratory Data Analysis
# 5.2 Meet the Data
# a. School - the name of the college
# b. Tier - the rank of the college into one of four tiers
# c. Retention - the percentage of freshmen who return to the school the following
# year
# d. Grad.rate - the percentage of freshman who graduate in a period of six
# years
# e. Pct.20 - the percentage of classes with 20 or fewer students
# f. Pct.50 - the percentage of classes with 50 or more students
# g. Full.time - the percentage of faculty who are hired full-time
# h. Top.10 - the percentage of incoming students who were in the top ten
# percent of their high school class
# i. Accept.rate - the acceptance rate of students who apply to the college
# j. Alumni.giving - the percentage of alumni from the college who contribute
# financially
dat <- read.delim("C:/Users/aruac/OneDrive/libros de R/R by Example Course/R by Example data/college.txt")
summary(dat)
## School Enrollment Tier
## Pacific : 2 Min. : 1712 Min. :1.000
## Adelphi : 1 1st Qu.: 9814 1st Qu.:2.000
## Akron : 1 Median :16478 Median :2.000
## Alabama : 1 Mean :18875 Mean :2.527
## Alabama - Birmingham: 1 3rd Qu.:26856 3rd Qu.:3.000
## Alabama - Huntsville: 1 Max. :67082 Max. :4.000
## (Other) :253 NA's :3
## Retention Grad.rate Pct.20 Pct.50
## Min. :54.00 Min. : 9.00 Min. :18.00 Min. : 0.00
## 1st Qu.:76.00 1st Qu.:49.00 1st Qu.:36.50 1st Qu.: 6.00
## Median :83.50 Median :62.00 Median :45.00 Median :10.00
## Mean :82.57 Mean :62.98 Mean :47.22 Mean :10.54
## 3rd Qu.:90.00 3rd Qu.:77.00 3rd Qu.:58.00 3rd Qu.:15.00
## Max. :99.00 Max. :98.00 Max. :94.00 Max. :30.00
## NA's :2 NA's :1 NA's :5 NA's :6
## Full.time Top.10 Accept.rate Alumni.giving
## Min. : 37.00 Min. : 7.00 Min. : 8.00 Min. : 1.00
## 1st Qu.: 82.00 1st Qu.: 19.75 1st Qu.: 49.00 1st Qu.: 8.75
## Median : 89.00 Median : 29.50 Median : 64.00 Median :12.00
## Mean : 86.27 Mean : 39.78 Mean : 60.84 Mean :14.99
## 3rd Qu.: 93.50 3rd Qu.: 53.00 3rd Qu.: 75.00 3rd Qu.:18.25
## Max. :100.00 Max. :100.00 Max. :100.00 Max. :67.00
## NA's :5 NA's :24 NA's :3 NA's :8
college <- subset(dat,complete.cases(dat))
str(college)
## 'data.frame': 230 obs. of 11 variables:
## $ School : Factor w/ 259 levels "Adelphi","Akron",..: 73 169 258 27 124 194 163 44 33 51 ...
## $ Enrollment : int 19230 7497 11446 2126 10299 17833 8855 23196 12386 14060 ...
## $ Tier : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Retention : int 97 98 99 98 98 98 98 99 98 96 ...
## $ Grad.rate : int 98 96 97 88 94 94 95 95 92 95 ...
## $ Pct.20 : int 77 75 79 71 65 72 73 77 73 71 ...
## $ Pct.50 : num 8 9 7 6 13 12 7 8 5 5 ...
## $ Full.time : int 93 92 88 97 90 99 86 92 86 97 ...
## $ Top.10 : int 95 97 97 97 97 92 99 94 86 90 ...
## $ Accept.rate : int 8 10 9 17 12 9 17 10 28 22 ...
## $ Alumni.giving: int 40 61 41 31 37 35 39 36 33 39 ...
summary(college)
## School Enrollment Tier Retention
## Pacific : 2 Min. : 2126 Min. :1.00 Min. :54.00
## Adelphi : 1 1st Qu.:10291 1st Qu.:2.00 1st Qu.:77.00
## Akron : 1 Median :17304 Median :2.00 Median :84.00
## Alabama : 1 Mean :19317 Mean :2.37 Mean :83.71
## Alabama - Birmingham: 1 3rd Qu.:26962 3rd Qu.:3.00 3rd Qu.:91.00
## Alabama - Huntsville: 1 Max. :67082 Max. :4.00 Max. :99.00
## (Other) :223
## Grad.rate Pct.20 Pct.50 Full.time
## Min. : 9.00 Min. :22.00 Min. : 0.00 Min. : 37.00
## 1st Qu.:52.25 1st Qu.:37.00 1st Qu.: 6.00 1st Qu.: 83.00
## Median :65.00 Median :45.00 Median :10.00 Median : 89.00
## Mean :65.44 Mean :47.33 Mean :10.82 Mean : 86.86
## 3rd Qu.:79.00 3rd Qu.:57.50 3rd Qu.:15.00 3rd Qu.: 94.00
## Max. :98.00 Max. :88.00 Max. :30.00 Max. :100.00
##
## Top.10 Accept.rate Alumni.giving
## Min. : 7.00 Min. : 8.0 Min. : 1.00
## 1st Qu.: 20.00 1st Qu.:49.0 1st Qu.: 9.00
## Median : 30.00 Median :64.0 Median :13.00
## Mean : 40.25 Mean :60.3 Mean :15.35
## 3rd Qu.: 53.00 3rd Qu.:75.0 3rd Qu.:19.00
## Max. :100.00 Max. :99.0 Max. :61.00
##
names(college)
## [1] "School" "Enrollment" "Tier" "Retention"
## [5] "Grad.rate" "Pct.20" "Pct.50" "Full.time"
## [9] "Top.10" "Accept.rate" "Alumni.giving"
# 5.3 Comparing Distributions
# One measure of the quality of a college is the variable Retention, the percentage
# of freshmen who return to the college the following year.
stripchart(college$Retention,method = "stack",pch=19,xlab = "Retention")

stripchart(Retention~Tier,method="stack",pch=19,
xlab="Retention Percentage",
ylab="Tier",xlim=c(50,100),data = college)
# 5.3.2 Identifying outliers
identify(college$Retention,college$Tier,n=4,
labels = college$School)

## integer(0)
# 5.3.3 Five-number summaries and boxplots
b.output <- boxplot(Retention~Tier, data = college,horizontal=TRUE,
ylab="Tier",xlab="Retention", main="National Universities Retention by Tier")

b.output
## $stats
## [,1] [,2] [,3] [,4]
## [1,] 90.0 79 69 62
## [2,] 93.5 84 76 69
## [3,] 96.0 86 79 72
## [4,] 97.0 89 82 74
## [5,] 99.0 93 88 79
## attr(,"class")
## 1
## "integer"
##
## $n
## [1] 51 81 60 38
##
## $conf
## [,1] [,2] [,3] [,4]
## [1,] 95.22565 85.12222 77.77614 70.71845
## [2,] 96.77435 86.87778 80.22386 73.28155
##
## $out
## [1] 88 65 61 61 54
##
## $group
## [1] 1 3 4 4 4
##
## $names
## [1] "1" "2" "3" "4"
plot(college$Retention,college$Grad.rate,
xlab = "Retention",ylab="Graduation Rate")
fit <- line(college$Retention,college$Grad.rate)
fit
##
## Call:
## line(college$Retention, college$Grad.rate)
##
## Coefficients:
## [1] -83.658 1.789
# The fitted line is given by
# GraduationRate = ???83.63+1.79×RetentionRate.
# The slope of this line is 1.79 - for every one percent increase in the retention
# rate, the average graduation rate increases by 1.79%.
abline(coef(fit),col="red")

# 5.4.2 Plotting residuals and identifying outliers
plot(college$Retention,fit$residuals,
xlab = "Retention",ylab = "Residuals");abline(h=0,col="darkgreen",ylim=c(-150,25))
# We do notice two unusually large residuals, and we identify and label these
# residuals using the identify function.
identify(college$Retention,fit$residuals,n=2,
labels = college$School)

## integer(0)
# Although Bridgeport has a relatively low retention percentage, it has a large positive residual which
# indicates that its graduation percentage is large given its retention percentage.
# In contrast, New Orleans has a large negative
# residual. This school's graduation percentage is lower than one would predict
# from its retention percentage.
bgsu <- read.delim("C:/Users/aruac/OneDrive/libros de R/R by Example Course/R by Example data/bgsu.txt")
plot(x=bgsu$Year,y=bgsu$Enrollment,ylab = "Enrollment",xlab = "Year")
fit <- lm(Enrollment~Year,data = bgsu);abline(fit,col="red")

plot(bgsu$Year,fit$residuals);abline(h=0,col="green")

# Least-squares fit to enrollment data (a) and residual plot (b). There is a clear
# curvature pattern to the residuals, indicating that the enrollment is not increasing in
# a linear fashion
# 5.5.2 Transforming by a logarithm and fitting a line
# Enrollment = a exp(b Year)
# log Enrollment = log a+ b Year
bgsu$log.Enrollment <- log(bgsu$Enrollment)
head(bgsu)
## Year Enrollment log.Enrollment
## 1 1955 4410 8.391630
## 2 1956 4959 8.508959
## 3 1957 5227 8.561593
## 4 1958 5584 8.627661
## 5 1959 6046 8.707152
## 6 1960 6400 8.764053
plot(bgsu$Year,bgsu$Enrollment)

fit2 <- lm(log.Enrollment~Year,data = bgsu)
fit2$coefficients
## (Intercept) Year
## -153.25703366 0.08268126
exp(-153.26)
## [1] 2.754404e-67
exp(0.0827)
## [1] 1.086216
# abline(fit2)
plot(bgsu$Year,fit2$residuals)
abline(h=0,col="red")

# From the R output, we see that the least-squares fit to the log enrollment
# data is logEnrollment = ???153.257+0.0827Y ear.
# This is equivalent to the exponential fit
# Enrollment = exp(???153.257+0.0827Y ear) ??? (1.086)Y ear,
# where ??? means "is proportional to." We see that BGSU's enrollment was
# increasing approximately 8.6% a year during the period between 1955 and
# 1970.
# 5.6 Exploring Fraction Data
# 5.6.1 Stemplot
# Example 5.3 (Ratings of colleges (continued)).
# One measure of quality of a university is the percentage of incoming students
# who graduated in the top ten percent of their high school class. Suppose
# we focus our attention at the "Top Ten" percentages for the Tier 1 colleges.
# We first use the subset function to extract the Tier 1 schools and put them
# in a new data frame college1:
college1 <- subset(college,Tier==1)
# A stemplot of the percentages can be produced using the stem function.
stem(college1$Top.10)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 4 | 3
## 5 | 589
## 6 | 344468
## 7 | 355599
## 8 | 02445556777888
## 9 | 00223334566677777889
## 10 | 0