# R by Example 
# Chapter 5 Exploratory Data Analysis
# 5.2 Meet the Data
# a. School - the name of the college
# b. Tier - the rank of the college into one of four tiers
# c. Retention - the percentage of freshmen who return to the school the following
# year
# d. Grad.rate - the percentage of freshman who graduate in a period of six
# years
# e. Pct.20 - the percentage of classes with 20 or fewer students
# f. Pct.50 - the percentage of classes with 50 or more students
# g. Full.time - the percentage of faculty who are hired full-time
# h. Top.10 - the percentage of incoming students who were in the top ten
# percent of their high school class
# i. Accept.rate - the acceptance rate of students who apply to the college
# j. Alumni.giving - the percentage of alumni from the college who contribute
# financially
dat <- read.delim("C:/Users/aruac/OneDrive/libros de R/R by Example Course/R by Example data/college.txt")
summary(dat)
##                   School      Enrollment         Tier      
##  Pacific             :  2   Min.   : 1712   Min.   :1.000  
##  Adelphi             :  1   1st Qu.: 9814   1st Qu.:2.000  
##  Akron               :  1   Median :16478   Median :2.000  
##  Alabama             :  1   Mean   :18875   Mean   :2.527  
##  Alabama - Birmingham:  1   3rd Qu.:26856   3rd Qu.:3.000  
##  Alabama - Huntsville:  1   Max.   :67082   Max.   :4.000  
##  (Other)             :253   NA's   :3                      
##    Retention       Grad.rate         Pct.20          Pct.50     
##  Min.   :54.00   Min.   : 9.00   Min.   :18.00   Min.   : 0.00  
##  1st Qu.:76.00   1st Qu.:49.00   1st Qu.:36.50   1st Qu.: 6.00  
##  Median :83.50   Median :62.00   Median :45.00   Median :10.00  
##  Mean   :82.57   Mean   :62.98   Mean   :47.22   Mean   :10.54  
##  3rd Qu.:90.00   3rd Qu.:77.00   3rd Qu.:58.00   3rd Qu.:15.00  
##  Max.   :99.00   Max.   :98.00   Max.   :94.00   Max.   :30.00  
##  NA's   :2       NA's   :1       NA's   :5       NA's   :6      
##    Full.time          Top.10        Accept.rate     Alumni.giving  
##  Min.   : 37.00   Min.   :  7.00   Min.   :  8.00   Min.   : 1.00  
##  1st Qu.: 82.00   1st Qu.: 19.75   1st Qu.: 49.00   1st Qu.: 8.75  
##  Median : 89.00   Median : 29.50   Median : 64.00   Median :12.00  
##  Mean   : 86.27   Mean   : 39.78   Mean   : 60.84   Mean   :14.99  
##  3rd Qu.: 93.50   3rd Qu.: 53.00   3rd Qu.: 75.00   3rd Qu.:18.25  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :67.00  
##  NA's   :5        NA's   :24       NA's   :3        NA's   :8
college <- subset(dat,complete.cases(dat))
str(college)
## 'data.frame':    230 obs. of  11 variables:
##  $ School       : Factor w/ 259 levels "Adelphi","Akron",..: 73 169 258 27 124 194 163 44 33 51 ...
##  $ Enrollment   : int  19230 7497 11446 2126 10299 17833 8855 23196 12386 14060 ...
##  $ Tier         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Retention    : int  97 98 99 98 98 98 98 99 98 96 ...
##  $ Grad.rate    : int  98 96 97 88 94 94 95 95 92 95 ...
##  $ Pct.20       : int  77 75 79 71 65 72 73 77 73 71 ...
##  $ Pct.50       : num  8 9 7 6 13 12 7 8 5 5 ...
##  $ Full.time    : int  93 92 88 97 90 99 86 92 86 97 ...
##  $ Top.10       : int  95 97 97 97 97 92 99 94 86 90 ...
##  $ Accept.rate  : int  8 10 9 17 12 9 17 10 28 22 ...
##  $ Alumni.giving: int  40 61 41 31 37 35 39 36 33 39 ...
summary(college)
##                   School      Enrollment         Tier        Retention    
##  Pacific             :  2   Min.   : 2126   Min.   :1.00   Min.   :54.00  
##  Adelphi             :  1   1st Qu.:10291   1st Qu.:2.00   1st Qu.:77.00  
##  Akron               :  1   Median :17304   Median :2.00   Median :84.00  
##  Alabama             :  1   Mean   :19317   Mean   :2.37   Mean   :83.71  
##  Alabama - Birmingham:  1   3rd Qu.:26962   3rd Qu.:3.00   3rd Qu.:91.00  
##  Alabama - Huntsville:  1   Max.   :67082   Max.   :4.00   Max.   :99.00  
##  (Other)             :223                                                 
##    Grad.rate         Pct.20          Pct.50        Full.time     
##  Min.   : 9.00   Min.   :22.00   Min.   : 0.00   Min.   : 37.00  
##  1st Qu.:52.25   1st Qu.:37.00   1st Qu.: 6.00   1st Qu.: 83.00  
##  Median :65.00   Median :45.00   Median :10.00   Median : 89.00  
##  Mean   :65.44   Mean   :47.33   Mean   :10.82   Mean   : 86.86  
##  3rd Qu.:79.00   3rd Qu.:57.50   3rd Qu.:15.00   3rd Qu.: 94.00  
##  Max.   :98.00   Max.   :88.00   Max.   :30.00   Max.   :100.00  
##                                                                  
##      Top.10        Accept.rate   Alumni.giving  
##  Min.   :  7.00   Min.   : 8.0   Min.   : 1.00  
##  1st Qu.: 20.00   1st Qu.:49.0   1st Qu.: 9.00  
##  Median : 30.00   Median :64.0   Median :13.00  
##  Mean   : 40.25   Mean   :60.3   Mean   :15.35  
##  3rd Qu.: 53.00   3rd Qu.:75.0   3rd Qu.:19.00  
##  Max.   :100.00   Max.   :99.0   Max.   :61.00  
## 
names(college)
##  [1] "School"        "Enrollment"    "Tier"          "Retention"    
##  [5] "Grad.rate"     "Pct.20"        "Pct.50"        "Full.time"    
##  [9] "Top.10"        "Accept.rate"   "Alumni.giving"
# 5.3 Comparing Distributions
# One measure of the quality of a college is the variable Retention, the percentage
# of freshmen who return to the college the following year.
stripchart(college$Retention,method = "stack",pch=19,xlab = "Retention")

stripchart(Retention~Tier,method="stack",pch=19,
           xlab="Retention Percentage",
           ylab="Tier",xlim=c(50,100),data = college)
# 5.3.2 Identifying outliers
identify(college$Retention,college$Tier,n=4,
         labels = college$School)

## integer(0)
# 5.3.3 Five-number summaries and boxplots
b.output <- boxplot(Retention~Tier, data = college,horizontal=TRUE,
         ylab="Tier",xlab="Retention", main="National Universities Retention by Tier")

b.output
## $stats
##      [,1] [,2] [,3] [,4]
## [1,] 90.0   79   69   62
## [2,] 93.5   84   76   69
## [3,] 96.0   86   79   72
## [4,] 97.0   89   82   74
## [5,] 99.0   93   88   79
## attr(,"class")
##         1 
## "integer" 
## 
## $n
## [1] 51 81 60 38
## 
## $conf
##          [,1]     [,2]     [,3]     [,4]
## [1,] 95.22565 85.12222 77.77614 70.71845
## [2,] 96.77435 86.87778 80.22386 73.28155
## 
## $out
## [1] 88 65 61 61 54
## 
## $group
## [1] 1 3 4 4 4
## 
## $names
## [1] "1" "2" "3" "4"
plot(college$Retention,college$Grad.rate,
     xlab = "Retention",ylab="Graduation Rate")
fit <- line(college$Retention,college$Grad.rate)
fit
## 
## Call:
## line(college$Retention, college$Grad.rate)
## 
## Coefficients:
## [1]  -83.658    1.789
# The fitted line is given by
# GraduationRate = ???83.63+1.79×RetentionRate.
# The slope of this line is 1.79 - for every one percent increase in the retention
# rate, the average graduation rate increases by 1.79%.
abline(coef(fit),col="red")

# 5.4.2 Plotting residuals and identifying outliers
plot(college$Retention,fit$residuals,
     xlab = "Retention",ylab = "Residuals");abline(h=0,col="darkgreen",ylim=c(-150,25))
# We do notice two unusually large residuals, and we identify and label these
# residuals using the identify function.
identify(college$Retention,fit$residuals,n=2,
         labels = college$School)

## integer(0)
# Although Bridgeport has a relatively low retention percentage, it has a large positive residual which
# indicates that its graduation percentage is large given its retention percentage.
# In contrast, New Orleans has a large negative
# residual. This school's graduation percentage is lower than one would predict
# from its retention percentage.
bgsu <- read.delim("C:/Users/aruac/OneDrive/libros de R/R by Example Course/R by Example data/bgsu.txt")
plot(x=bgsu$Year,y=bgsu$Enrollment,ylab = "Enrollment",xlab = "Year")
fit <- lm(Enrollment~Year,data = bgsu);abline(fit,col="red")

plot(bgsu$Year,fit$residuals);abline(h=0,col="green")

# Least-squares fit to enrollment data (a) and residual plot (b). There is a clear
# curvature pattern to the residuals, indicating that the enrollment is not increasing in
# a linear fashion
# 5.5.2 Transforming by a logarithm and fitting a line
# Enrollment = a exp(b Year)
# log Enrollment = log a+ b Year
bgsu$log.Enrollment <- log(bgsu$Enrollment)
head(bgsu)
##   Year Enrollment log.Enrollment
## 1 1955       4410       8.391630
## 2 1956       4959       8.508959
## 3 1957       5227       8.561593
## 4 1958       5584       8.627661
## 5 1959       6046       8.707152
## 6 1960       6400       8.764053
plot(bgsu$Year,bgsu$Enrollment)

fit2 <- lm(log.Enrollment~Year,data = bgsu)
fit2$coefficients
##   (Intercept)          Year 
## -153.25703366    0.08268126
exp(-153.26)
## [1] 2.754404e-67
exp(0.0827)
## [1] 1.086216
# abline(fit2)
plot(bgsu$Year,fit2$residuals)
abline(h=0,col="red")

# From the R output, we see that the least-squares fit to the log enrollment
# data is logEnrollment = ???153.257+0.0827Y ear.
# This is equivalent to the exponential fit
# Enrollment = exp(???153.257+0.0827Y ear) ??? (1.086)Y ear,
# where ??? means "is proportional to." We see that BGSU's enrollment was
# increasing approximately 8.6% a year during the period between 1955 and
# 1970.

# 5.6 Exploring Fraction Data
# 5.6.1 Stemplot
# Example 5.3 (Ratings of colleges (continued)).
# One measure of quality of a university is the percentage of incoming students
# who graduated in the top ten percent of their high school class. Suppose
# we focus our attention at the "Top Ten" percentages for the Tier 1 colleges.
# We first use the subset function to extract the Tier 1 schools and put them
# in a new data frame college1:
college1 <- subset(college,Tier==1)
# A stemplot of the percentages can be produced using the stem function.
stem(college1$Top.10)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##    4 | 3
##    5 | 589
##    6 | 344468
##    7 | 355599
##    8 | 02445556777888
##    9 | 00223334566677777889
##   10 | 0