install.packages("ggplot2", repos = "https://cran.rstudio.com/")
##
## The downloaded binary packages are in
## /var/folders/d0/72tl9f2x3qq6ykz23_3z7pdw0000gn/T//RtmpbUR7yp/downloaded_packages
install.packages("MASS", repos = "https://cran.rstudio.com/")
##
## The downloaded binary packages are in
## /var/folders/d0/72tl9f2x3qq6ykz23_3z7pdw0000gn/T//RtmpbUR7yp/downloaded_packages
library(ggplot2)
library(MASS)
Problem 1.10
The three variables in the data set are Tree, age, and
circumference.
# Load the built-in Orange data set and list the names of its columns.
data("Orange")
names(Orange)
## [1] "Tree" "age" "circumference"
Problem 1.11
The average age of the trees is 922.1429.
# Average of the age column (exact-name extraction with `[[`).
mean(Orange[["age"]])
## [1] 922.1429
Problem 1.12
The largest circumference is 214.
# Largest value in the circumference column.
max(Orange[["circumference"]])
## [1] 214
Problem 2.4
# Part 1: the letter "a" repeated five times.
part1 <- rep("a", 5)
print(part1)
## [1] "a" "a" "a" "a" "a"
# Part 2: the odd numbers between 1 and 100.
part2 <- seq(from = 1, to = 100, by = 2)
print(part2)
## [1] 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49
## [26] 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95 97 99
# Part 3: each of 1, 2, 3 repeated three times in a row.
part3 <- rep(as.numeric(1:3), each = 3)
print(part3)
## [1] 1 1 1 2 2 2 3 3 3
# Part 4: 1 three times, 2 twice, 3 once.
part4 <- rep(as.numeric(1:3), times = 3:1)
print(part4)
## [1] 1 1 1 2 2 3
# Part 5: count up to 5, then back down to 1.
part5 <- c(seq_len(5), rev(seq_len(4)))
print(part5)
## [1] 1 2 3 4 5 4 3 2 1
Problem 2.20
The average sales for the months with 31 days is 166.5714
The average sales for the months without 31 days is 205.6
This means that the months with 31 days actually have lower average
sales than those without 31 days.
# Monthly sales table: one row per month, in calendar order.
temp_month <- c(
  "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
  "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
)
temp_sales <- c(79, 74, 161, 127, 133, 210, 99, 143, 249, 249, 368, 302)
cd <- data.frame(month = temp_month, sales = temp_sales)
cd
## month sales
## 1 JAN 79
## 2 FEB 74
## 3 MAR 161
## 4 APR 127
## 5 MAY 133
## 6 JUN 210
## 7 JUL 99
## 8 AUG 143
## 9 SEP 249
## 10 OCT 249
## 11 NOV 368
## 12 DEC 302
# Select the 31-day months by name rather than by row position, so the split
# stays correct even if the rows of `cd` are reordered or filtered.
months_with_31 <- c("JAN", "MAR", "MAY", "JUL", "AUG", "OCT", "DEC")
has_31_days <- cd$month %in% months_with_31
with31 <- cd[has_31_days, ]
wo31 <- cd[!has_31_days, ]
# Average sales for each group of months.
mean(with31$sales)
## [1] 166.5714
mean(wo31$sales)
## [1] 205.6
Problem 2.21
In 1995 the salary dropped by 0.11 compared with the previous year.
The year with the largest percentage change is 1991, when the salary
was 56.14 percent higher than in the previous year.
# Salary by year, with the absolute and percentage change from the previous
# year. The first year has no predecessor, so both change columns start at NA.
year <- 1990:1999
salary <- c(0.57, 0.89, 1.08, 1.12, 1.18, 1.07, 1.17, 1.38, 1.44, 1.72)
# Build the data frame directly instead of growing a NULL object with `$<-`.
bsball <- data.frame(year = year, salary = salary)
bsball$diff <- c(NA, diff(bsball$salary))
# Percentage change = change / previous year's salary * 100.
previous_salary <- c(NA, head(bsball$salary, -1))
bsball$percent_diff <- bsball$diff / previous_salary * 100
print(bsball)
## year salary diff percent_diff
## 1 1990 0.57 NA NA
## 2 1991 0.89 0.32 56.140351
## 3 1992 1.08 0.19 21.348315
## 4 1993 1.12 0.04 3.703704
## 5 1994 1.18 0.06 5.357143
## 6 1995 1.07 -0.11 -9.322034
## 7 1996 1.17 0.10 9.345794
## 8 1997 1.38 0.21 17.948718
## 9 1998 1.44 0.06 4.347826
## 10 1999 1.72 0.28 19.444444
Problem 2.23
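The variance function can never be negative, because
mean(x^2) - mean(x)^2 equals the mean of the squared deviations
(x - mean(x))^2, and an average of squares is at least zero. For
example, the variance of 1 through 10 is 8.25.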
# Population variance of a numeric vector (divides by n, not n - 1).
#
# x: a numeric vector.
# Returns a single number: the mean squared deviation of x from its mean.
#
# Mathematically this equals mean(x^2) - mean(x)^2, but that form subtracts
# two large, nearly equal numbers and loses precision (it can even come out
# slightly negative). Averaging the squared deviations is never negative.
prob_23 <- function(x) {
  mean((x - mean(x))^2)
}
prob_23(1:10)
## [1] 8.25
Problem 2.42
1. The proportion that is less than 500 miles long is 58.16%
2. The proportion that is less than the mean length is 66.67%
3. The 75% quantile is 680.
data("rivers")
# 1
# The mean of a logical vector is the proportion of TRUE values, so this is
# the share of rivers shorter than 500.
is_under_500 <- rivers < 500
prop_500 <- mean(is_under_500)
print(prop_500)
## [1] 0.5815603
# 2
# Share of rivers shorter than the average length.
mean_l <- mean(rivers)
is_below_mean <- rivers < mean_l
prop_less <- mean(is_below_mean)
print(prop_less)
## [1] 0.6666667
# 3
# Third quartile of the river lengths.
q_75 <- quantile(rivers, probs = 0.75)
print(q_75)
## 75%
## 680
Problem 2.44
The mean is 591.1844
The median is 425
The trimmed mean (trim = 0.25) is 449.9155.
# Arithmetic mean of the river lengths.
mean(x = rivers)
## [1] 591.1844
# Median of the river lengths.
median(x = rivers)
## [1] 425
# Trimmed mean: drop 25% of the observations from each end before averaging.
mean(x = rivers, trim = 0.25)
## [1] 449.9155
Problem 2.47
The mean is about -5.67e-17, which is zero up to floating-point rounding error.
The standard deviation is 1
The distribution of river lengths is right-skewed and sharply peaked
(high kurtosis) near a length of 500. There appear to be some outliers
in the mid-2000s and the high 3000s. The most common lengths
are close to or below 500.
# Standardize the river lengths: subtract the mean and divide by the standard
# deviation. scale() returns a one-column matrix.
z_scores <- scale(rivers, center = TRUE, scale = TRUE)
head(z_scores, n = 6L)
## [,1]
## [1,] 0.2912008
## [2,] -0.5490998
## [3,] -0.5389757
## [4,] -0.4033127
## [5,] -0.1360364
## [6,] -0.2858731
# After standardizing, the mean is zero (up to rounding) and the sd is one.
mean(x = z_scores)
## [1] -5.669224e-17
sd(x = z_scores)
## [1] 1
# Both plots draw from the same one-column data frame of river lengths.
rivers_df <- data.frame(rivers = rivers)

# Histogram on the density scale with a kernel density curve drawn on top.
ggplot(data = rivers_df, mapping = aes(x = rivers)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    fill = "blue",
    color = "black"
  ) +
  geom_density(color = "yellow", linewidth = 1) +
  labs(
    title = "Histogram with Density Plot of Rivers",
    x = "River Length",
    y = "Density"
  ) +
  theme_minimal()

# Single boxplot of the river lengths; the empty x value gives one group.
ggplot(data = rivers_df, mapping = aes(x = "", y = rivers)) +
  geom_boxplot(color = "black") +
  labs(
    title = "Boxplot of Rivers",
    x = " ",
    y = "River Length"
  ) +
  theme_minimal()

Problem 2.62
A factor stores the distinct possible values of a variable, which are
called its levels. For Cylinders they are 3, 4, 5, 6, 8, and rotary.
# Load the Cars93 data set (from the attached MASS package).
data(Cars93)
# Printing the factor shows every observation followed by its levels.
Cars93[["Cylinders"]]
## [1] 4 6 6 6 4 4 6 6 6 8
## [11] 8 4 4 6 4 6 6 8 8 6
## [21] 4 6 4 4 4 6 4 6 4 6
## [31] 4 4 4 4 4 6 6 8 3 4
## [41] 4 4 4 4 4 4 4 8 6 6
## [51] 6 8 4 4 4 6 rotary 4 6 4
## [61] 6 4 6 4 4 6 6 4 4 6
## [71] 6 4 4 4 6 6 6 4 4 3
## [81] 4 4 3 4 4 4 4 4 5 4
## [91] 6 4 5
## Levels: 3 4 5 6 8 rotary
Problem 2.64
# Bar chart with one bar per Cylinders level; bar height is the number of cars.
ggplot(data = Cars93, mapping = aes(x = Cylinders)) +
  geom_bar() +
  labs(
    title = "Bar Graph of Cylinders",
    x = "Number of Cylinders",
    y = "Count"
  ) +
  theme_minimal()
