This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
data(algae, package="DMwR2")
algae
#Central Tendency Measures: mean, median, and mode
mean(algae$a1)
[1] 16.9235
median(algae$a1)
[1] 6.95
median(algae$a1, na.rm=TRUE)
[1] 6.95
mean(algae$NO3)
[1] NA
mean(algae$NO3, na.rm=TRUE)
[1] 3.282389
median(algae$NO3, na.rm=TRUE)
[1] 2.675
# Install the 'modes' package when it is not yet available.
# installed.packages("modes") only lists the packages found under a library
# path named "modes"; it never installs anything, so the following
# library("modes") call cannot succeed.
# NOTE(review): confirm that 'modes' is still distributed on CRAN.
if (!requireNamespace("modes", quietly = TRUE)) {
  install.packages("modes")
}
Package LibPath Version Priority Depends Imports LinkingTo Suggests Enhances License License_is_FOSS License_restricts_use
OS_type Archs MD5sum NeedsCompilation Built
library("modes")
Error in library("modes") : there is no package called ‘modes’
library(DMwR2)
centralValue(algae$a1)
[1] 6.95
centralValue(algae$speed)
[1] "high"
#statistics of spread
var(algae$a1)
[1] 455.7532
sd(algae$a1)
[1] 21.34838
range(algae$a1)
[1] 0.0 89.8
max(algae$a1)
[1] 89.8
min(algae$a1)
[1] 0
IQR(algae$a1)
[1] 23.3
quantile(algae$a1)
0% 25% 50% 75% 100%
0.00 1.50 6.95 24.80 89.80
quantile(algae$a1, probs=c(0.2, 0.8))
20% 80%
1.20 32.18
# Count the missing values in each row of algae.
# rowSums(is.na(.)) works on the logical NA mask directly, whereas apply()
# would first coerce the mixed-type data frame to a character matrix.
nas <- rowSums(is.na(algae))
cat("The dataset contains ", sum(nas), "NA values. \n")
The dataset contains 33 NA values.
cat("The dataset contains ", sum(!complete.cases(algae)), "(out of ", nrow(algae) ,") incomplete rows. \n")
The dataset contains 16 (out of 200 ) incomplete rows.
#ways to obtain summaries over the entire dataset
summary(algae)
season size speed mxPH mnO2 Cl NO3 NH4
autumn:40 large :45 high :84 Min. :5.600 Min. : 1.500 Min. : 0.222 Min. : 0.050 Min. : 5.00
spring:53 medium:84 low :33 1st Qu.:7.700 1st Qu.: 7.725 1st Qu.: 10.981 1st Qu.: 1.296 1st Qu.: 38.33
summer:45 small :71 medium:83 Median :8.060 Median : 9.800 Median : 32.730 Median : 2.675 Median : 103.17
winter:62 Mean :8.012 Mean : 9.118 Mean : 43.636 Mean : 3.282 Mean : 501.30
3rd Qu.:8.400 3rd Qu.:10.800 3rd Qu.: 57.824 3rd Qu.: 4.446 3rd Qu.: 226.95
Max. :9.700 Max. :13.400 Max. :391.500 Max. :45.650 Max. :24064.00
NA's :1 NA's :2 NA's :10 NA's :2 NA's :2
oPO4 PO4 Chla a1 a2 a3 a4
Min. : 1.00 Min. : 1.00 Min. : 0.200 Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 15.70 1st Qu.: 41.38 1st Qu.: 2.000 1st Qu.: 1.50 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
Median : 40.15 Median :103.29 Median : 5.475 Median : 6.95 Median : 3.000 Median : 1.550 Median : 0.000
Mean : 73.59 Mean :137.88 Mean : 13.971 Mean :16.92 Mean : 7.458 Mean : 4.309 Mean : 1.992
3rd Qu.: 99.33 3rd Qu.:213.75 3rd Qu.: 18.308 3rd Qu.:24.80 3rd Qu.:11.375 3rd Qu.: 4.925 3rd Qu.: 2.400
Max. :564.60 Max. :771.60 Max. :110.456 Max. :89.80 Max. :72.600 Max. :42.800 Max. :44.600
NA's :2 NA's :2 NA's :12
a5 a6 a7
Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
Median : 1.900 Median : 0.000 Median : 1.000
Mean : 5.064 Mean : 5.964 Mean : 2.495
3rd Qu.: 7.500 3rd Qu.: 6.925 3rd Qu.: 2.400
Max. :44.400 Max. :77.600 Max. :31.600
library(Hmisc)
data(iris)
describe(iris)
iris
5 Variables 150 Observations
----------------------------------------------------------------------------------------------------------------------------------
Sepal.Length
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
150 0 35 0.998 5.843 0.9462 4.600 4.800 5.100 5.800 6.400 6.900 7.255
lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
----------------------------------------------------------------------------------------------------------------------------------
Sepal.Width
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
150 0 23 0.992 3.057 0.4872 2.345 2.500 2.800 3.000 3.300 3.610 3.800
lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
----------------------------------------------------------------------------------------------------------------------------------
Petal.Length
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
150 0 43 0.998 3.758 1.979 1.30 1.40 1.60 4.35 5.10 5.80 6.10
lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
----------------------------------------------------------------------------------------------------------------------------------
Petal.Width
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
150 0 22 0.99 1.199 0.8676 0.2 0.2 0.3 1.3 1.8 2.2 2.3
lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
----------------------------------------------------------------------------------------------------------------------------------
Species
n missing distinct
150 0 3
Value setosa versicolor virginica
Frequency 50 50 50
Proportion 0.333 0.333 0.333
----------------------------------------------------------------------------------------------------------------------------------
library(dplyr)
alg <- as_tibble(algae) #the book converted algae to alg, but
identical(alg, algae) #they are identical, so below we use just algae.
[1] TRUE
# Mean NO3 (ignoring NAs) and median a1 over the whole data set.
summarise(algae, avgNO3 = mean(NO3, na.rm = TRUE), medA1 = median(a1))

# Mean and median of every column from mxPH to Cl, ignoring NAs.
# Naming the functions yields readable output columns (e.g. mxPH_mean).
select(algae, mxPH:Cl) %>%
  summarise_all(list(mean = mean, median = median), na.rm = TRUE)

# Row count and median a7 for each season/size combination.
# .groups = "drop_last" spells out dplyr's default regrouping, so the
# informational message about it is no longer emitted.
group_by(algae, season, size) %>%
  summarise(nobs = n(), mA7 = median(a7), .groups = "drop_last")
`summarise()` regrouping output by 'season' (override with `.groups` argument)
# Variance of each of the columns a1 to a7.
# funs() has been deprecated since dplyr 0.8.0; pass the function directly.
select(algae, a1:a7) %>% summarise_all(var)

# Minimum and maximum of each of the same columns.
select(algae, a1:a7) %>% summarise_all(c("min", "max"))

# Variance of the sepal length per iris species. The explicit .groups value
# matches dplyr's default and suppresses the regrouping message.
data(iris)
group_by(iris, Species) %>%
  summarise(var = var(Sepal.Length), .groups = "drop_last")
`summarise()` ungrouping output (override with `.groups` argument)
# base R’s aggregate() can be helpful for summary functions that don’t return a scalar
group_by(iris, Species) %>% summarise(var=quantile(Sepal.Length))
`summarise()` regrouping output by 'Species' (override with `.groups` argument)
aggregate(x=iris$Sepal.Length, by=list(Species=iris$Species), FUN="quantile")
NA
aggregate(x=iris[-5], by=list(Species=iris$Species), FUN="quantile")
NA
NA
# Base R's by() applies a function to subsets of a data frame split by a grouping factor.
by(data=iris[,1:4], INDICES=iris$Species, FUN=summary)
iris$Species: setosa
Sepal.Length Sepal.Width Petal.Length Petal.Width
Min. :4.300 Min. :2.300 Min. :1.000 Min. :0.100
1st Qu.:4.800 1st Qu.:3.200 1st Qu.:1.400 1st Qu.:0.200
Median :5.000 Median :3.400 Median :1.500 Median :0.200
Mean :5.006 Mean :3.428 Mean :1.462 Mean :0.246
3rd Qu.:5.200 3rd Qu.:3.675 3rd Qu.:1.575 3rd Qu.:0.300
Max. :5.800 Max. :4.400 Max. :1.900 Max. :0.600
-------------------------------------------------------------------------------------------------
iris$Species: versicolor
Sepal.Length Sepal.Width Petal.Length Petal.Width
Min. :4.900 Min. :2.000 Min. :3.00 Min. :1.000
1st Qu.:5.600 1st Qu.:2.525 1st Qu.:4.00 1st Qu.:1.200
Median :5.900 Median :2.800 Median :4.35 Median :1.300
Mean :5.936 Mean :2.770 Mean :4.26 Mean :1.326
3rd Qu.:6.300 3rd Qu.:3.000 3rd Qu.:4.60 3rd Qu.:1.500
Max. :7.000 Max. :3.400 Max. :5.10 Max. :1.800
-------------------------------------------------------------------------------------------------
iris$Species: virginica
Sepal.Length Sepal.Width Petal.Length Petal.Width
Min. :4.900 Min. :2.200 Min. :4.500 Min. :1.400
1st Qu.:6.225 1st Qu.:2.800 1st Qu.:5.100 1st Qu.:1.800
Median :6.500 Median :3.000 Median :5.550 Median :2.000
Mean :6.588 Mean :2.974 Mean :5.552 Mean :2.026
3rd Qu.:6.900 3rd Qu.:3.175 3rd Qu.:5.875 3rd Qu.:2.300
Max. :7.900 Max. :3.800 Max. :6.900 Max. :2.500
# Close every open graphics device before plotting.
graphics.off()

library(ggplot2)
data(algae, package = "DMwR2")

# Base-graphics bar chart of how often each season occurs.
freqOcc <- table(algae$season)
barplot(freqOcc, main = "Frequency of the Seasons")

# The same chart with ggplot2, drawn with the current default theme.
ggplot(algae, aes(x = season)) +
  geom_bar() +
  ggtitle("Frequency of the Seasons")

# Centre the plot title for all ggplot2 figures drawn from here on.
theme_update(plot.title = element_text(hjust = 0.5))

# Same chart again, now with the centred title.
ggplot(algae, aes(x = season)) +
  geom_bar() +
  ggtitle("Frequency of the Seasons")

# Map season to the bar outline colour.
ggplot(algae, aes(x = season, color = season)) +
  geom_bar() +
  ggtitle("Frequency of the Seasons")

# Map season to the bar fill colour.
ggplot(algae, aes(x = season, fill = season)) +
  geom_bar() +
  ggtitle("Frequency of the Seasons")
# Continuous variables: distribution of the maximal pH.
hist(algae$mxPH, xlab = "Maximal pH")
# bins = 30 is ggplot2's default bin count, stated explicitly to avoid the
# "pick better value" message; na.rm = TRUE drops the missing mxPH value
# without raising a warning.
ggplot(algae, aes(x = mxPH)) +
  geom_histogram(bins = 30, na.rm = TRUE) +
  xlab("Maximal pH")
NA
NA
# Box plot of the maximal pH with base graphics.
boxplot(algae$mxPH, ylab = "Maximal pH")

# ggplot2 version; only y is mapped, so the x-axis tick labels are hidden.
ggplot(algae, aes(y = mxPH)) +
  geom_boxplot() +
  ylab("Maximal pH") +
  theme(axis.text.x = element_blank())

# Maximal pH broken down by season (base graphics, then ggplot2).
boxplot(mxPH ~ season, data = algae, ylab = "Maximal pH", xlab = "Seasons")
ggplot(algae, aes(x = season, y = mxPH)) +
  geom_boxplot() +
  labs(x = "Seasons", y = "Maximal pH")

# Histograms of a1 faceted by size (rows) and speed (columns).
ggplot(algae, aes(x = a1)) +
  geom_histogram() +
  facet_grid(size ~ speed)

# Histograms of a1 with one panel per speed level.
ggplot(algae, aes(x = a1)) +
  geom_histogram() +
  facet_grid(. ~ speed)

# Box plots of a1 by speed, faceted by size (rows) and season (columns).
ggplot(algae, aes(x = speed, y = a1)) +
  geom_boxplot() +
  facet_grid(size ~ season)
# Scatter plots of a2 against a1.

# Base graphics, then the ggplot2 equivalent.
plot(algae$a1, algae$a2, main = "Relationships btw a1 and a2")
ggplot(algae, aes(x = a1, y = a2)) +
  geom_point() +
  ggtitle("Relationship btw a1 and a2")

# Colour the points by season.
plot(
  algae$a1, algae$a2,
  col = algae$season,
  main = "Relationships btw a1 and a2"
)
ggplot(algae, aes(x = a1, y = a2, color = season)) +
  geom_point() +
  ggtitle("Relationship btw a1 and a2")

# One panel per season: first in a single row, then wrapped.
ggplot(algae, aes(x = a1, y = a2)) +
  geom_point() +
  ggtitle("Relationship btw a1 and a2") +
  facet_grid(. ~ season)
ggplot(algae, aes(x = a1, y = a2)) +
  geom_point() +
  ggtitle("Relationship btw a1 and a2") +
  facet_wrap(~season)

# Pairwise scatter-plot matrix of columns 12 to 16.
pairs(algae[, 12:16])
# Install GGally only when it is missing: calling install.packages() for a
# package that is already loaded in the session fails or is skipped.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
Error in install.packages : Updating loaded packages
library(GGally)
# Pairwise plot matrix of columns 12 to 16 of algae.
ggpairs(algae, columns = 12:16)
# GGally is already attached at this point, so an unconditional
# install.packages("GGally") only triggers an "in use" warning; install it
# solely when it is absent.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Warning in install.packages :
package ‘GGally’ is in use and will not be installed
ggpairs(algae, columns=2:5)
#what types of plots are shown and why?
# Density plot, histogram, bar plot, box plot, and scatter plot.
# For the size and speed columns, bar plots and box plots are used because these columns hold discrete (categorical) variables; the box plots show the five-number summary of a distribution: minimum, Q1, median, Q3, and maximum.
# For mxPH and mnO2, density plots, histograms, and scatter plots are used because these columns hold continuous variables. A density plot shows whether the data are symmetric, positively skewed, or negatively skewed, and hence how the mean, median, and mode relate; a scatter plot provides a first look at bivariate numerical data, revealing clusters of points, outliers, etc.
algae
h <- c(3.5, 2.6, 4.0, 3.2, 4.5, 3.3 ) #height values
length(h)
[1] 6
w <- c(13.5, 12.6, 14.0, 13.2, 14.5, 13.3 ) #weight values
length(w)
[1] 6
# Q-Q plot: qqplot() sorts h and w separately and plots the sorted values
# against each other.
qqplot(h, w)
# Fit the least-squares line to the same sorted pairs that the Q-Q plot
# shows, rather than to the original (unsorted) pairing of h and w.
abline(lsfit(sort(h), sort(w)))
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
#HW2
# Sample values for the exercise, listed in ascending order.
x <- c(
  13, 15, 16, 16, 19, 20, 20, 21, 22, 22, 25, 25, 25, 25, 30,
  33, 33, 35, 35, 35, 35, 36, 40, 45, 46, 52, 70
)
# Arithmetic mean of x (NAs, if any, are ignored).
result.mean <- mean(x, na.rm = TRUE)
print(result.mean)
[1] 29.96296
result.median <- median(x,na.rm = TRUE)
print(result.median)
[1] 25
min(x)
[1] 13
max(x)
[1] 70
# Mid-range: the average of the smallest and the largest value of x.
# Uses `<-` for assignment, the idiomatic R operator, instead of `=`.
mid_rangr <- (min(x) + max(x)) / 2
print(mid_rangr)
[1] 41.5
quantile(x)
0% 25% 50% 75% 100%
13.0 20.5 25.0 35.0 70.0
summary(x)
Min. 1st Qu. Median Mean 3rd Qu. Max.
13.00 20.50 25.00 29.96 35.00 70.00
boxplot(x, ylab="age value")
# Age and %fat measurements (18 values each).
age <- c(
  23, 23, 27, 27, 39, 41, 47, 49, 50,
  52, 54, 54, 56, 57, 58, 58, 60, 61
)
fat <- c(
  9.5, 26.5, 7.8, 17.8, 31.4, 25.9, 27.4, 27.2, 31.2,
  34.6, 42.5, 28.8, 33.4, 30.2, 34.1, 32.9, 41.2, 35.7
)
# Arithmetic mean of age (NAs, if any, are ignored).
result.mean <- mean(age, na.rm = TRUE)
print(result.mean)
[1] 46.44444
result.mean <- mean(fat,na.rm = TRUE)
print(result.mean)
[1] 28.78333
result.median <- median(age,na.rm = TRUE)
print(result.median)
[1] 51
result.median <- median(fat,na.rm = TRUE)
print(result.median)
[1] 30.7
sd(age)
[1] 13.21862
sd(fat)
[1] 9.254395
boxplot(age, ylab="age value")
boxplot(fat, ylab="%fat value")
plot(fat, age, main="Relationships btw age and %fat")
qqplot(fat, age)
# Q-Q plot of fat against age: both vectors are sorted independently and the
# sorted values are paired.
qqplot(fat, age)
# Draw the least-squares line through those sorted pairs so that it
# describes the points actually shown; lsfit(fat, age) would instead fit
# the original, unsorted pairing.
abline(lsfit(sort(fat), sort(age)))
# Six two-dimensional points; the distances computed from here on are
# measured to x6.
x1 <- c(1.5, 1.7)
x2 <- c(2.0, 1.9)
x3 <- c(1.6, 1.8)
x4 <- c(1.2, 1.5)
x5 <- c(1.5, 1.0)
x6 <- c(1.4, 1.6)

# Euclidean distance between x1 and x6.
sqrt(sum((x1 - x6)^2))
[1] 0.1414214
sqrt(sum((x2-x6)^2))
[1] 0.6708204
sqrt(sum((x3-x6)^2))
[1] 0.2828427
sqrt(sum((x4-x6)^2))
[1] 0.2236068
sqrt(sum((x5-x6)^2))
[1] 0.6082763