# Install ggplot2 only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cran.rstudio.com/")
}

## 
## The downloaded binary packages are in
##  /var/folders/d0/72tl9f2x3qq6ykz23_3z7pdw0000gn/T//RtmpbUR7yp/downloaded_packages

# Install MASS only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS", repos = "https://cran.rstudio.com/")
}

## 
## The downloaded binary packages are in
##  /var/folders/d0/72tl9f2x3qq6ykz23_3z7pdw0000gn/T//RtmpbUR7yp/downloaded_packages

library(ggplot2)

library(MASS)

Problem 1.10

The three variables in the data set are Tree, age, and circumference.

# Load the Orange data set and list the names of its columns.
data("Orange")
names(Orange)

## [1] "Tree"          "age"           "circumference"

Problem 1.11

The average age of the trees is 922.1429.

# Average of the age column.
mean(Orange[["age"]])

## [1] 922.1429

Problem 1.12

The largest circumference is 214.

# Largest value in the circumference column.
max(Orange[["circumference"]])

## [1] 214

Problem 2.4

# Part 1: the letter "a" five times.
part1 <- rep_len("a", 5)
print(part1)

## [1] "a" "a" "a" "a" "a"

# Part 2: the odd numbers from 1 to 99.
part2 <- seq(from = 1, to = 99, by = 2)
print(part2)

##  [1]  1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49
## [26] 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95 97 99

# Part 3: each of 1, 2, 3 repeated three times in a row.
part3 <- rep(c(1, 2, 3), times = c(3, 3, 3))
print(part3)

## [1] 1 1 1 2 2 2 3 3 3

# Part 4: 1 three times, 2 twice, 3 once.
part4 <- rep(c(1, 2, 3), times = 3:1)
print(part4)

## [1] 1 1 1 2 2 3

# Part 5: count up from 1 to 5, then back down to 1.
part5 <- c(seq_len(5), rev(seq_len(4)))
print(part5)

## [1] 1 2 3 4 5 4 3 2 1

Problem 2.20

The average sales for the months with 31 days is 166.5714

The average sales for the months without 31 days is 205.6

This means that the months with 31 days actually have lower average sales than the months without 31 days.

# Monthly sales, one row per calendar month.
temp_month <- c(
  "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
  "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
)
temp_sales <- c(79, 74, 161, 127, 133, 210, 99, 143, 249, 249, 368, 302)

cd <- data.frame(month = temp_month, sales = temp_sales)
cd

##    month sales
## 1    JAN    79
## 2    FEB    74
## 3    MAR   161
## 4    APR   127
## 5    MAY   133
## 6    JUN   210
## 7    JUL    99
## 8    AUG   143
## 9    SEP   249
## 10   OCT   249
## 11   NOV   368
## 12   DEC   302

# Split on month name instead of hard-coded row positions, so the grouping
# stays correct even if the rows are reordered, and the list of 31-day
# months is written only once.
months_31 <- c("JAN", "MAR", "MAY", "JUL", "AUG", "OCT", "DEC")
has_31 <- cd$month %in% months_31
with31 <- cd[has_31, ]
wo31 <- cd[!has_31, ]

mean(with31$sales)

## [1] 166.5714

mean(wo31$sales)

## [1] 205.6

Problem 2.21

In 1995 the salary dropped by 0.11 compared with the previous year (from 1.18 to 1.07).

The year with the biggest percentage difference is 1991 where the difference was 56.14 percent more than the previous year.

# Salary by year, with the year-over-year change.
year <- 1990:1999
salary <- c(0.57, 0.89, 1.08, 1.12, 1.18, 1.07, 1.17, 1.38, 1.44, 1.72)

# Build the data frame directly rather than growing a NULL object via `$<-`.
bsball <- data.frame(year = year, salary = salary)

# Absolute change from the previous year; the first year has no predecessor,
# so its entry is NA.
bsball$diff <- c(NA, diff(bsball$salary))

# Percentage change relative to the previous year's salary, reusing the
# differences computed above.
bsball$percent_diff <- c(NA, (bsball$diff[-1] / head(bsball$salary, -1)) * 100)
print(bsball)

##    year salary  diff percent_diff
## 1  1990   0.57    NA           NA
## 2  1991   0.89  0.32    56.140351
## 3  1992   1.08  0.19    21.348315
## 4  1993   1.12  0.04     3.703704
## 5  1994   1.18  0.06     5.357143
## 6  1995   1.07 -0.11    -9.322034
## 7  1996   1.17  0.10     9.345794
## 8  1997   1.38  0.21    17.948718
## 9  1998   1.44  0.06     4.347826
## 10 1999   1.72  0.28    19.444444

Problem 2.23

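The function returns the population variance: the average squared deviation from the mean, which is equivalent to mean(x^2) - mean(x)^2. Because it is an average of squared values, it can never be negative, and it is zero only when all values are equal. For the values 1 through 10 it is 8.25.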

#' Population variance of a numeric vector
#'
#' Computes the mean squared deviation from the mean. This is algebraically
#' equal to `mean(x^2) - mean(x)^2`, but it avoids subtracting two large,
#' nearly equal numbers, which loses precision when the values are large
#' relative to their spread.
#'
#' @param x A numeric vector.
#' @return A single number: the variance of `x` computed with a divisor of
#'   `length(x)`.
prob_23 <- function(x) {
  mean((x - mean(x))^2)
}

prob_23(1:10)

## [1] 8.25

Problem 2.42

1. The proportion that is less than 500 miles long is 58.16%

2. The proportion that is less than the mean length is 66.67%

3. The 75% quantile is 680.

data("rivers")

# Part 1: proportion of rivers shorter than 500.
# The comparison gives a TRUE/FALSE vector, and the mean of a logical vector
# is the fraction of TRUE values.
prop_500 <- mean(rivers < 500)
print(prop_500)

## [1] 0.5815603

# Part 2: proportion of rivers shorter than the mean length.
mean_l <- mean(rivers)

prop_less <- mean(rivers < mean_l)
print(prop_less)

## [1] 0.6666667

# Part 3: the 75th percentile of river length.
q_75 <- quantile(rivers, probs = 0.75)
print(q_75)

## 75% 
## 680

Problem 2.44

The mean is 591.1844

The median is 425

The trimmed mean (trim = 0.25) is 449.9155.

# Arithmetic mean of the river lengths.
mean(x = rivers)

## [1] 591.1844

# Median of the river lengths.
median(x = rivers)

## [1] 425

# Trimmed mean: a fraction of 0.25 of the observations is dropped from each
# end before averaging.
mean(x = rivers, trim = 0.25)

## [1] 449.9155

Problem 2.47

The mean is about -5.67e-17 (on the order of 10^-17), which is zero up to floating-point rounding error.

The standard deviation is 1

The shape of the data is right-skewed, with a sharp peak around a river length of 500. There appear to be some outliers in the mid 2000s and high 3000s. The most common lengths are close to or below 500.

# Standardize the river lengths: subtract the mean and divide by the
# standard deviation (both are the defaults of scale(), spelled out here).
z_scores <- scale(rivers, center = TRUE, scale = TRUE)
head(z_scores, n = 6L)

##            [,1]
## [1,]  0.2912008
## [2,] -0.5490998
## [3,] -0.5389757
## [4,] -0.4033127
## [5,] -0.1360364
## [6,] -0.2858731

# Mean of the standardized values.
mean(z_scores)

## [1] -5.669224e-17

# Standard deviation of the standardized values.
sd(z_scores)

## [1] 1

# Density-scaled histogram of river lengths with a density curve on top.
ggplot(data = data.frame(rivers = rivers), mapping = aes(x = rivers)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    color = "black",
    fill = "blue"
  ) +
  geom_density(linewidth = 1, color = "yellow") +
  labs(
    title = "Histogram with Density Plot of Rivers",
    x = "River Length",
    y = "Density"
  ) +
  theme_minimal()

# Single boxplot of river lengths; the empty x value puts every observation
# in one group.
ggplot(data = data.frame(rivers = rivers), mapping = aes(x = "", y = rivers)) +
  geom_boxplot(color = "black") +
  labs(
    title = "Boxplot of Rivers",
    x = " ",
    y = "River Length"
  ) +
  theme_minimal()

Problem 2.62

Cylinders is a factor: it stores the distinct values the variable can take, which are called its levels. The levels are 3, 4, 5, 6, 8, and rotary.

data(Cars93)
# Print the Cylinders factor; its levels are listed at the end of the output.
Cars93[["Cylinders"]]

##  [1] 4      6      6      6      4      4      6      6      6      8     
## [11] 8      4      4      6      4      6      6      8      8      6     
## [21] 4      6      4      4      4      6      4      6      4      6     
## [31] 4      4      4      4      4      6      6      8      3      4     
## [41] 4      4      4      4      4      4      4      8      6      6     
## [51] 6      8      4      4      4      6      rotary 4      6      4     
## [61] 6      4      6      4      4      6      6      4      4      6     
## [71] 6      4      4      4      6      6      6      4      4      3     
## [81] 4      4      3      4      4      4      4      4      5      4     
## [91] 6      4      5     
## Levels: 3 4 5 6 8 rotary

Problem 2.64

# Bar chart of how many cars fall into each Cylinders level.
ggplot(data = Cars93, mapping = aes(x = Cylinders)) +
  geom_bar() +
  labs(
    title = "Bar Graph of Cylinders",
    x = "Number of Cylinders",
    y = "Count"
  ) +
  theme_minimal()

Assignment 6

Jose Ojea

2025-03-19

Problem 1.10

The three variables in the data set are Tree, age, and circumference.

Problem 1.11

The average age of the trees is 922.1429.

Problem 1.12

The largest circumference is 214.

Problem 2.4

Problem 2.20

The average sales for the months with 31 days is 166.5714

The average sales for the months without 31 days is 205.6

This means that the months with 31 days actually have lower average sales than the months without 31 days.

Problem 2.21

In 1995 the salary dropped by 0.11 compared with the previous year (from 1.18 to 1.07).

The year with the biggest percentage difference is 1991 where the difference was 56.14 percent more than the previous year.

Problem 2.23

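The function returns the population variance: the average squared deviation from the mean, which is equivalent to mean(x^2) - mean(x)^2. Because it is an average of squared values, it can never be negative, and it is zero only when all values are equal. For the values 1 through 10 it is 8.25.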

Problem 2.42

1. The proportion that is less than 500 miles long is 58.16%

2. The proportion that is less than the mean length is 66.67%

3. The 75% quantile is 680.

Problem 2.44

The mean is 591.1844

The median is 425

The trimmed mean (trim = 0.25) is 449.9155.

Problem 2.47

The mean is about -5.67e-17 (on the order of 10^-17), which is zero up to floating-point rounding error.

The standard deviation is 1

The shape of the data is right-skewed, with a sharp peak around a river length of 500. There appear to be some outliers in the mid 2000s and high 3000s. The most common lengths are close to or below 500.

Problem 2.62

Cylinders is a factor: it stores the distinct values the variable can take, which are called its levels. The levels are 3, 4, 5, 6, 8, and rotary.

Problem 2.64