#install.packages("dplyr")
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("stringr")
library(stringr)

#install.packages("assertr")
library(assertr)

#install.packages("ggplot2")
library(ggplot2)

#install.packages("lubridate")
library(lubridate)
## 
## Adjuntando el paquete: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
#install.packages("readxl")
library(readxl)

#install.packages("forcats")
library(forcats)

#install.packages("visdat")
library(visdat)

#install.packages("stringdist")
library(stringdist)

#install.packages("fuzzyjoin")
library(fuzzyjoin)

#install.packages("gapminder")
library(gapminder)

#install.packages("e1071")
library(e1071)

#install.packages("datasets")
library(datasets)

1. The following table contains the ages of people who are frequent customers of a vegetarian restaurant. With this data, calculate:

  1. The mean
  2. The median
  3. The mode
  4. The frequency histogram
28 23 31 31 21
30 28 23 29 36
36 31 28 28 40
22 21 28 28 31
# Ages from the table above, entered row by row (4 rows of 5 values).
ages <- c(
  28, 23, 31, 31, 21,
  30, 28, 23, 29, 36,
  36, 31, 28, 28, 40,
  22, 21, 28, 28, 31
)

a) The mean

# Arithmetic mean of the ages. The result is stored in a variable that shares
# its name with base::mean(); calls to mean() still resolve to the function
# because R looks up a function object when a name is used in call position.
mean <- base::mean(x = ages)
print(mean)
## [1] 28.65

b) The median

# Middle value of the sorted ages. The variable shares its name with
# stats::median(); the namespaced call makes it explicit which one is invoked.
median <- stats::median(x = ages)
print(median)
## [1] 28

c) The mode

# Mode: tabulate the ages, take the label of the most frequent value
# (which.max() returns the first maximum if several values tie), and convert
# that label back to a number.
modo <- as.numeric(names(which.max(table(ages))))
print(modo)
## [1] 28

d) The frequency histogram

# Frequency histogram of the ages, using bins that are 2 years wide.
ggplot(data = data.frame(ages = ages), mapping = aes(x = ages)) +
  geom_histogram(binwidth = 2, colour = "black", fill = "blue", alpha = 0.7) +
  theme_minimal() +
  ggtitle("frequency histogram") +
  xlab("ages") +
  ylab("frequency")

2. With the data in the table above, determine the variance and standard deviation.

# Sample variance of the ages.
varianza <- stats::var(x = ages)
print(varianza)
## [1] 25.71316
# Sample standard deviation, obtained as the square root of the variance
# (this is how stats::sd() computes it for a numeric vector).
desviacion_estandar <- sqrt(varianza)
print(desviacion_estandar)
## [1] 5.070814

3. Using the mean and two standard deviations, determine the Chebyshev interval and interpret its meaning in the context of the problem.

# Chebyshev interval: mean +/- k standard deviations. At least 1 - 1/k^2 of the
# observations lie inside it (75% for k = 2).
# The interval must be centred on the MEAN of the ages, not the median;
# mean(ages) is called directly so the block does not depend on other variables
# holding the right statistic.
k <- 2
limite_inferior <- mean(ages) - k * desviacion_estandar
limite_superior <- mean(ages) + k * desviacion_estandar
# The label is built from `k` so the message stays correct if k is changed.
print(paste(
  paste0("El intervalo de Chebyshev para k=", k, " es:"),
  limite_inferior, "a", limite_superior
))
## [1] "El intervalo de Chebyshev para k=2 es: 18.508371354711 a 38.791628645289"

This means that, according to Chebyshev’s theorem, at least 75% of the age values (1 − 1/k² with k = 2) fall within two standard deviations of the mean, that is, within 28.65 ± 2 × 5.07, a range of about 18.51 to 38.79 years.

4. Explain what the Central Tendency measures indicate and what the dispersion measures indicate.

Measures of Central Tendency: Mean: It’s the arithmetic average: the sum of all the values divided by how many there are. It gives a “typical” value for the data, but it is sensitive to extreme values.

Median: It’s the middle value when the data is sorted. It splits the data into two equal parts.

Mode: It’s the value that appears the most in the data.

What do they indicate?: These measures give you an idea of what the most typical or common value in the data set is.

Measures of Dispersion: Variance: It measures how spread out the data is. If it’s high, the data points are far apart.

Standard Deviation: It’s the square root of the variance and also measures how spread out the data is, but in the same units as the data.

What do they indicate?: These measures show you how “spread out” or “grouped together” the data is around the mean. If they’re low, the data is close to the mean; if they’re high, the data is more spread out.

5. Using the data from the database titled POP BEVERAGES CONSUMPTION SURVEY, use EXCEL to determine, for each age category of consumers, the following about the Glasses of soft drink per day:

# Ask the user to pick the survey workbook interactively, then load its
# default sheet and convert the result to a plain data.frame in one step.
ruta_archivo <- file.choose()
pop <- as.data.frame(read_excel(path = ruta_archivo))
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`

# Load one sheet per age category and area from the same workbook, skipping
# the first two rows of every sheet. Each read reports the same message:
## New names:
## • `` -> `...5`

# Rural sheets
youth_rural <- read_excel(path = ruta_archivo, sheet = "Youth Rural", skip = 2)
adults_rural <- read_excel(path = ruta_archivo, sheet = "20-40 Rural", skip = 2)
seniors_rural <- read_excel(path = ruta_archivo, sheet = "40-60 Rural", skip = 2)

# Urban sheets
youth_urban <- read_excel(path = ruta_archivo, sheet = "Youth Urban", skip = 2)
adults_urban <- read_excel(path = ruta_archivo, sheet = "20-40 Urban", skip = 2)
seniors_urban <- read_excel(path = ruta_archivo, sheet = "40-60 Urban", skip = 2)

a) The frequency histogram for each age category in the rural area, and the skew (bias) shown by the data, judged from the graph by comparing the mean to the median

#' Histogram of a consumption column with mean and median reference lines
#'
#' @param df Data frame holding the survey responses.
#' @param title Plot title.
#' @param column Name of the numeric column to plot. Defaults to
#'   "Glasses of soft drink a day", the column used throughout this analysis.
#' @return A list with two elements: `plot`, the ggplot object (red dashed line
#'   = mean, green dashed line = median), and `skewness`, the mean minus the
#'   median of the column. This is a simple direction-of-skew indicator
#'   (positive: mean above median; negative: mean below median), not the
#'   moment-based skewness coefficient.
plot_histogram <- function(df, title, column = "Glasses of soft drink a day") {
  # Fail early with a clear message instead of silently producing NA summaries.
  stopifnot(
    is.character(column), length(column) == 1L,
    column %in% names(df)
  )

  values <- df[[column]]
  mean_value <- mean(values, na.rm = TRUE)
  median_value <- median(values, na.rm = TRUE)
  skewness <- mean_value - median_value

  p <- ggplot(df, aes(x = .data[[column]])) +
    geom_histogram(binwidth = 1, fill = "blue", alpha = 0.7, color = "black") +
    # The reference lines are constants, so they are set outside aes().
    # `linewidth` replaces `size`, which is deprecated for lines since
    # ggplot2 3.4.0.
    geom_vline(
      xintercept = mean_value, color = "red",
      linetype = "dashed", linewidth = 1
    ) +
    geom_vline(
      xintercept = median_value, color = "green",
      linetype = "dashed", linewidth = 1
    ) +
    labs(title = title, x = "Soft drink glasses per day", y = "Frequency") +
    theme_minimal()

  list(plot = p, skewness = skewness)
}
# Build the three rural histograms (each result holds the plot and the
# mean-minus-median value), then display the plots in youth/adults/seniors order.
hist_youth_rural <- plot_histogram(
  df = youth_rural,
  title = "Consumo de refresco - Jóvenes (Rural)"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
hist_adults_rural <- plot_histogram(
  df = adults_rural,
  title = "Consumo de refresco - Adultos (Rural)"
)
hist_seniors_rural <- plot_histogram(
  df = seniors_rural,
  title = "Consumo de refresco - Mayores (Rural)"
)

invisible(lapply(
  list(hist_youth_rural, hist_adults_rural, hist_seniors_rural),
  function(result) print(result[["plot"]])
))

b) The frequency histogram for each age category in the urban area, and the skew (bias) shown by the data, judged from the graph by comparing the mean to the median

# Build the three urban histograms, then display the plots in
# youth/adults/seniors order.
hist_youth_urban <- plot_histogram(
  df = youth_urban,
  title = "Soft Drink Consumption - Youth (Urban)"
)
hist_adults_urban <- plot_histogram(
  df = adults_urban,
  title = "Soft Drink Consumption - Adults (Urban)"
)
hist_seniors_urban <- plot_histogram(
  df = seniors_urban,
  title = "Soft Drink Consumption - Seniors (Urban)"
)

invisible(lapply(
  list(hist_youth_urban, hist_adults_urban, hist_seniors_urban),
  function(result) print(result[["plot"]])
))

# Summary table of the mean-minus-median value for every age category and
# area. Rows alternate Rural/Urban within each category, so the labels are
# generated with rep() and the values are listed in that same order.
skewness_comparison <- data.frame(
  Category = rep(c("Youth", "Adults", "Seniors"), each = 2),
  Area = rep(c("Rural", "Urban"), times = 3),
  Skewness = c(
    hist_youth_rural[["skewness"]], hist_youth_urban[["skewness"]],
    hist_adults_rural[["skewness"]], hist_adults_urban[["skewness"]],
    hist_seniors_rural[["skewness"]], hist_seniors_urban[["skewness"]]
  )
)

print(skewness_comparison)
##   Category  Area   Skewness
## 1    Youth Rural -0.1666667
## 2    Youth Urban  0.0000000
## 3   Adults Rural  0.4400000
## 4   Adults Urban  0.3400000
## 5  Seniors Rural  0.1000000
## 6  Seniors Urban  0.0000000
