# head(): returns the first few rows (left commented out, so it is not run here)
#head(college)
# str(): prints the structure of the data frame, i.e. the column names,
# their data types and the first few values of each column
str(college)
## 'data.frame': 777 obs. of 19 variables:
## $ X : chr "Abilene Christian University" "Adelphi University" "Adrian College" "Agnes Scott College" ...
## $ Private : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Apps : int 1660 2186 1428 417 193 587 353 1899 1038 582 ...
## $ Accept : int 1232 1924 1097 349 146 479 340 1720 839 498 ...
## $ Enroll : int 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : int 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : int 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: int 2885 2683 1036 510 249 678 416 1594 973 799 ...
## $ P.Undergrad: int 537 1227 99 63 869 41 230 32 306 78 ...
## $ Outstate : int 7440 12280 11250 12960 7560 13500 13290 13868 15595 10468 ...
## $ Room.Board : int 3300 6450 3750 5450 4120 3335 5720 4826 4400 3380 ...
## $ Books : int 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : int 2200 1500 1165 875 1500 675 1500 850 500 1800 ...
## $ PhD : int 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : int 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: int 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : int 7041 10527 8735 19016 10922 9727 8861 11487 11644 8991 ...
## $ Grad.Rate : int 60 56 54 59 15 55 63 73 80 52 ...
# Count the missing (NA) entries across the whole data frame;
# a result of 0 means that no values are missing
sum(is.na(college))
## [1] 0
# Use the college names (column `X`) as row names and then drop that column,
# so that only the descriptive variables remain as columns.
# The step is guarded and works by column name instead of position: once `X`
# has been removed, re-running this section leaves the data frame untouched
# rather than turning the next column into row names and deleting it.
if ("X" %in% names(college)) {
  rownames(college) <- college$X
  college$X <- NULL
}
# Store Private as a factor so it is treated as a categorical variable
college$Private <- as.factor(college$Private)
# Scatter-plot matrix: pairs() draws one scatter plot for every pair of
# columns it is given, here the first three columns of the data frame
first_three <- college[, seq_len(3)]
pairs(first_three)
# Outstate plotted against Private, with both axes labelled explicitly
with(college, plot(Private, Outstate, xlab = "Private", ylab = "Outstate"))
# Flag "elite" colleges: Elite is "Yes" where Top10perc is above the cutoff
# and "No" otherwise. Both levels are declared explicitly so the factor keeps
# the same levels even if one of the two groups happens to be empty.
# (A missing Top10perc would give NA here; sum(is.na(college)) above is 0.)
elite_cutoff <- 50
Elite <- factor(
  ifelse(college$Top10perc > elite_cutoff, "Yes", "No"),
  levels = c("No", "Yes")
)
# Add the column by name instead of data.frame(college, Elite): re-running
# this section then overwrites Elite rather than appending Elite.1, Elite.2, ...
college$Elite <- Elite
# Outstate plotted against the new Elite factor
plot(college$Elite, college$Outstate, xlab = "Elite", ylab = "Outstate")
# Draw four histograms in a 2 x 2 grid.
# par() returns the settings it replaces; they are kept so that the
# single-panel layout can be restored once the histograms are drawn.
old_par <- par(mfrow = c(2, 2))
# Column to plot (name) -> axis label shown on the plot (value)
hist_labels <- c(
  Accept = "Accept",
  Top10perc = "Top 10 Percent",
  Grad.Rate = "Graduation Rate",
  Enroll = "Enroll"
)
for (column in names(hist_labels)) {
  hist(
    college[[column]],
    breaks = 6,
    freq = TRUE,
    main = paste("Histogram of", hist_labels[[column]]),
    xlab = hist_labels[[column]],
    ylab = "Frequency"
  )
}
# Restore the previous layout so later plots are not squeezed into one
# cell of the 2 x 2 grid
par(old_par)
# dim(): returns the number of rows followed by the number of columns
dim(college)
## [1] 777 19
The data set consists of 777 rows and 19 columns.
library(corrplot)
## corrplot 0.92 loaded
# Making a correlation plot:
# cor(): gives the correlation coefficients: cor(dataset_name[, numeric columns only])
# corrplot(): plots that correlation matrix: corrplot(cor(dataset_name[, numeric columns only]))
# The numeric columns are selected by type rather than by position. The
# previous positional index -c(1, 2, 19, 20, 21) also removed column 2 (Apps,
# which is numeric) and named columns 20 and 21, which do not exist in the
# 19-column data frame.
numeric_cols <- vapply(college, is.numeric, logical(1))
# drop = FALSE keeps a data frame even if only one numeric column is found
corrplot(cor(college[, numeric_cols, drop = FALSE]))
Findings: Blue circles indicate positive correlations, and the larger the circle, the stronger the correlation. For example, as the number of applications increases, the number of accepted applicants also increases, and vice versa. Red circles, however, indicate negative correlations: as one variable increases, the other decreases.