# Install the required packages only when they are missing, so re-running
# this script does not download them again every time.
required_pkgs <- c("ggplot2", "MASS")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, repos = "https://cran.rstudio.com/")
}

library(ggplot2)

library(MASS)

Problem 1.10

The three variables in the data set are Tree, age, and circumference.
# Load the built-in Orange data set and list its column names.
data("Orange")
names(Orange)
## [1] "Tree"          "age"           "circumference"

Problem 1.11

The average age of the trees is 922.1429.
# Average of the age column.
mean(Orange[["age"]])
## [1] 922.1429

Problem 1.12

The largest circumference is 214.
# Largest value in the circumference column.
max(Orange[["circumference"]])
## [1] 214

Problem 2.4

# Five copies of the string "a".
part1 <- rep("a", 5)
print(part1)
## [1] "a" "a" "a" "a" "a"
# The odd numbers from 1 up to 99.
part2 <- seq(from = 1, to = 99, by = 2)
print(part2)
##  [1]  1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49
## [26] 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95 97 99
# Each of 1, 2, 3 repeated three times in a row.
part3 <- rep(c(1, 2, 3), times = c(3, 3, 3))
print(part3)
## [1] 1 1 1 2 2 2 3 3 3
# 1 three times, 2 twice, 3 once.
part4 <- rep(c(1, 2, 3), times = 3:1)
print(part4)
## [1] 1 1 1 2 2 3
# Count up from 1 to 5, then back down to 1.
part5 <- c(1:4, 5:1)
print(part5)
## [1] 1 2 3 4 5 4 3 2 1

Problem 2.20

The average sales for the months with 31 days is 166.5714
The average sales for the months without 31 days is 205.6
This means that the months with 31 days actually have lower average sales than the months without 31 days.
# Monthly sales, one row per calendar month.
temp_month <- c(
  "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
  "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
)
temp_sales <- c(79, 74, 161, 127, 133, 210, 99, 143, 249, 249, 368, 302)

cd <- data.frame(month = temp_month, sales = temp_sales)
cd
##    month sales
## 1    JAN    79
## 2    FEB    74
## 3    MAR   161
## 4    APR   127
## 5    MAY   133
## 6    JUN   210
## 7    JUL    99
## 8    AUG   143
## 9    SEP   249
## 10   OCT   249
## 11   NOV   368
## 12   DEC   302
# Split the rows by whether the month has 31 days. Matching on the month
# label instead of hard-coded row positions keeps the split correct even if
# the rows are reordered, and the list of months is written only once.
months_31 <- c("JAN", "MAR", "MAY", "JUL", "AUG", "OCT", "DEC")
has_31 <- cd$month %in% months_31
with31 <- cd[has_31, ]
wo31 <- cd[!has_31, ]

mean(with31$sales)
## [1] 166.5714
mean(wo31$sales)
## [1] 205.6

Problem 2.21

In 1995 the salary dropped by 0.11 compared with the previous year.
The year with the biggest percentage change is 1991, when the salary was 56.14 percent higher than in the previous year.
# Salary values for the years 1990-1999.
year <- 1990:1999
salary <- c(0.57, 0.89, 1.08, 1.12, 1.18, 1.07, 1.17, 1.38, 1.44, 1.72)

# Build the data frame directly instead of growing a NULL object with `$<-`.
bsball <- data.frame(year = year, salary = salary)

# Change from the previous year; the first year has no predecessor, hence NA.
bsball$diff <- c(NA, diff(bsball$salary))

# Percentage change relative to the previous year's salary, reusing the
# differences computed above.
bsball$percent_diff <- c(NA, bsball$diff[-1] / head(bsball$salary, -1) * 100)
print(bsball)
##    year salary  diff percent_diff
## 1  1990   0.57    NA           NA
## 2  1991   0.89  0.32    56.140351
## 3  1992   1.08  0.19    21.348315
## 4  1993   1.12  0.04     3.703704
## 5  1994   1.18  0.06     5.357143
## 6  1995   1.07 -0.11    -9.322034
## 7  1996   1.17  0.10     9.345794
## 8  1997   1.38  0.21    17.948718
## 9  1998   1.44  0.06     4.347826
## 10 1999   1.72  0.28    19.444444

Problem 2.23

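The value returned by this variance function can never be negative: it is the population variance, i.e. the average of the squared deviations from the mean, and squares are never negative. For example, for 1 through 10 it is 8.25.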
# Population variance of `x`: the mean squared deviation from the mean.
#
# Mathematically equal to mean(x^2) - mean(x)^2, but that shortcut subtracts
# two nearly equal numbers when the values are large relative to their
# spread, which loses precision and can even give a negative result.
# Averaging the squared deviations is never negative.
#
# x: numeric vector.
# Returns a single number; the divisor is length(x), not length(x) - 1.
prob_23 <- function(x) {
  mean((x - mean(x))^2)
}

prob_23(1:10)
## [1] 8.25

Problem 2.42

1. The proportion that is less than 500 miles long is 58.16%
2. The proportion that is less than the mean length is 66.67%
3. The 75% quantile is 680.
data("rivers")

# 1
# The mean of a logical vector is the proportion of TRUE values, so this is
# the share of rivers shorter than 500.
prop_500 <- mean(500 > rivers)
print(prop_500)
## [1] 0.5815603
# 2
# Share of rivers shorter than the average river length.
mean_l <- mean(rivers)

prop_less <- mean(mean_l > rivers)
print(prop_less)
## [1] 0.6666667
# 3
# 75% quantile of the river lengths.
q_75 <- quantile(rivers, probs = 0.75)
print(q_75)
## 75% 
## 680

Problem 2.44

The mean is 591.1844
The median is 425
The trimmed mean (trim = 0.25) is 449.9155.
# Ordinary mean, median, and the mean after trimming 25% from each end.
mean(rivers)
## [1] 591.1844
median(rivers)
## [1] 425
mean(rivers, trim = 0.25)
## [1] 449.9155

Problem 2.47

The mean is about -5.67e-17, which is zero up to floating-point rounding error.
The standard deviation is 1.
The distribution is right-skewed and sharply peaked (high kurtosis), with the peak near a river length of 500. There appear to be some outliers in the mid-2000s and the high-3000s. The most common lengths are close to or below 500.
# Standardise the river lengths: subtract the mean and divide by the
# standard deviation. scale() returns a one-column matrix.
z_scores <- scale(rivers, center = TRUE, scale = TRUE)
head(z_scores, n = 6)
##            [,1]
## [1,]  0.2912008
## [2,] -0.5490998
## [3,] -0.5389757
## [4,] -0.4033127
## [5,] -0.1360364
## [6,] -0.2858731
# Check: standardised values have mean ~0 and standard deviation 1.
mean(z_scores)
## [1] -5.669224e-17
sd(z_scores)
## [1] 1
# Density-scaled histogram of river lengths with a density curve on top.
ggplot(data = data.frame(rivers = rivers), mapping = aes(x = rivers)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    fill = "blue",
    color = "black"
  ) +
  geom_density(color = "yellow", linewidth = 1) +
  labs(
    title = "Histogram with Density Plot of Rivers",
    x = "River Length",
    y = "Density"
  ) +
  theme_minimal()

# Boxplot of river lengths; the constant x value "" puts all rivers in one box.
ggplot(data = data.frame(rivers = rivers), mapping = aes(x = "", y = rivers)) +
  geom_boxplot(color = "black") +
  labs(
    title = "Boxplot of Rivers",
    x = " ",
    y = "River Length"
  ) +
  theme_minimal()

Problem 2.62

A factor stores the distinct possible values of a variable, which are called its levels. For Cylinders the levels are 3, 4, 5, 6, 8, and rotary.

data(Cars93)
# Printing the factor shows every value followed by its set of levels.
Cars93[["Cylinders"]]
##  [1] 4      6      6      6      4      4      6      6      6      8     
## [11] 8      4      4      6      4      6      6      8      8      6     
## [21] 4      6      4      4      4      6      4      6      4      6     
## [31] 4      4      4      4      4      6      6      8      3      4     
## [41] 4      4      4      4      4      4      4      8      6      6     
## [51] 6      8      4      4      4      6      rotary 4      6      4     
## [61] 6      4      6      4      4      6      6      4      4      6     
## [71] 6      4      4      4      6      6      6      4      4      3     
## [81] 4      4      3      4      4      4      4      4      5      4     
## [91] 6      4      5     
## Levels: 3 4 5 6 8 rotary

Problem 2.64

# Bar chart counting the cars at each Cylinders level.
ggplot(data = Cars93, mapping = aes(x = Cylinders)) +
  geom_bar() +
  labs(
    title = "Bar Graph of Cylinders",
    x = "Number of Cylinders",
    y = "Count"
  ) +
  theme_minimal()