library(tidyverse)
library(openintro)
library(broom)
library(GGally)

# Import the dataset

# Read the Seattle daily weather records from GitHub, then inspect the columns
sea_weather <- read.csv(
  file = "https://raw.githubusercontent.com/trevorpelletier/2020Spring/master/Seattle_weather.csv"
)
sea_weather %>% glimpse()
## Rows: 25,202
## Columns: 7
## $ DATE  <chr> "1949-01-01", "1949-01-02", "1949-01-03", "1949-01-04", "1949-01…
## $ PRCP  <dbl> 0.00, 0.03, 0.00, 0.00, 0.00, 0.03, 0.52, 0.00, 0.00, 0.00, 0.00…
## $ TMAX  <int> 35, 40, 40, 42, 41, 49, 46, 30, 32, 30, 37, 36, 41, 36, 46, 43, …
## $ TMIN  <int> 26, 22, 17, 20, 28, 37, 26, 18, 13, 12, 17, 19, 19, 25, 28, 26, …
## $ year  <int> 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…

## Filter for my birth month

# Keep only the February observations. The data set already carries an integer
# `month` column, so filter on it directly instead of re-parsing the character
# DATE column with month(); this also matches the later sampling step.
feb_sea_weather <- sea_weather %>%
  filter(month == 2)

# Check the output
glimpse(feb_sea_weather)
## Rows: 1,949
## Columns: 7
## $ DATE  <chr> "1949-02-01", "1949-02-02", "1949-02-03", "1949-02-04", "1949-02…
## $ PRCP  <dbl> 0.00, 0.10, 0.03, 0.11, 0.06, 0.06, 0.00, 0.00, 0.43, 0.96, 0.02…
## $ TMAX  <int> 37, 33, 35, 33, 35, 37, 38, 41, 43, 43, 37, 32, 34, 38, 42, 48, …
## $ TMIN  <int> 23, 25, 27, 24, 21, 31, 31, 33, 34, 30, 27, 21, 18, 30, 33, 41, …
## $ year  <int> 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949…
## $ month <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ day   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…

# Store the February mean and standard deviation of the daily highs (TMAX)
# and daily lows (TMIN)

feb_high_mean <- mean(feb_sea_weather$TMAX, na.rm = TRUE)
feb_high_sd <- sd(feb_sea_weather$TMAX, na.rm = TRUE)
feb_low_mean <- mean(feb_sea_weather$TMIN, na.rm = TRUE)
feb_low_sd <- sd(feb_sea_weather$TMIN, na.rm = TRUE)
feb_high_mean
## [1] 49.05182
feb_high_sd
## [1] 6.208218

# Density histogram of TMAX with the fitted normal curve drawn on top.
# after_stat(density) replaces the `..density..` notation that was deprecated
# in ggplot2 3.4.0, so the deprecation warning is no longer emitted.
ggplot(feb_sea_weather, aes(x = TMAX)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 12,
    boundary = feb_high_mean
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = feb_high_mean, sd = feb_high_sd)
  )

# When I use bins = 12 I can see a very good normal distribution mean=49.05182 and sd=6.208218 for TMAX in February.

# Mean and standard deviation of the February daily highs
feb_high_mean <- feb_sea_weather %>% pull(TMAX) %>% mean(na.rm = TRUE)
feb_high_sd <- feb_sea_weather %>% pull(TMAX) %>% sd(na.rm = TRUE)

# Standardize TMAX: distance from the mean measured in standard deviations
feb_sea_weather <- mutate(
  feb_sea_weather,
  TMAX_z = (TMAX - feb_high_mean) / feb_high_sd
)

# Peek at the first rows to confirm the new column is there
head(feb_sea_weather)
##         DATE PRCP TMAX TMIN year month day    TMAX_z
## 1 1949-02-01 0.00   37   23 1949     2   1 -1.941269
## 2 1949-02-02 0.10   33   25 1949     2   2 -2.585576
## 3 1949-02-03 0.03   35   27 1949     2   3 -2.263423
## 4 1949-02-04 0.11   33   24 1949     2   4 -2.585576
## 5 1949-02-05 0.06   35   21 1949     2   5 -2.263423
## 6 1949-02-06 0.06   37   31 1949     2   6 -1.941269

# I created a new column called TMAX_z that holds the z-scores for TMAX.

# Add up all TMAX z-scores (ignoring missing values) and show the total
sum_TMAX_z <- feb_sea_weather %>%
  pull(TMAX_z) %>%
  sum(na.rm = TRUE)
print(sum_TMAX_z)
## [1] 1.950211e-13

#1.950211e-13 is not exactly 0 due to rounding errors but computationally can be treated as 0 since it is so very very small.

# Density histogram of the TMAX z-scores with the standard normal curve overlaid.
# Uses after_stat(density) and linewidth in place of `..density..` and `size`,
# both deprecated in ggplot2 3.4.0, so no deprecation warnings are emitted.
ggplot(feb_sea_weather, aes(x = TMAX_z)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 12,
    boundary = 0,
    fill = "blue",
    alpha = 0.6
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = 0, sd = 1),
    color = "red",
    linewidth = 1
  ) +
  labs(
    title = "Density Histogram of TMAX Z-Scores with Standard Normal Distribution",
    x = "Z-Score",
    y = "Density"
  ) +
  theme_minimal()

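# Both the TMAX histogram and the TMAX_z histogram look like roughly symmetric bell curves, and their shapes are identical. This is because a z-score is a linear re-expression of TMAX (subtract the mean, then divide by the standard deviation), which shifts and rescales the horizontal axis without changing the shape of the distribution. On the standard normal curve the mean corresponds to z = 0.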

my_count <- nrow(feb_sea_weather)
count_distribution <- data.frame(matrix(nrow = 1, ncol = 4))
colnames(count_distribution) <- c("lowerz", "upperz", "expected", "observed")

# Initialized a placeholder data frame to keep track of the range of z-scores (which serve as our categories) and the expected and observed counts in each category.

# Each step below writes to its OWN row (index 1 to 6). Reusing index 1 for
# every interval would overwrite the previous interval and leave only the last.

# Interval [-3, -2) -> row 1
lowerz <- -3
upperz <- -2
index <- 1
# Expected count for the interval: normal area times the number of days
E <- (pnorm(upperz) - pnorm(lowerz)) * my_count
# Observed count: between() includes both end points, so subtract the values
# equal to the upper bound to get the half-open interval [lowerz, upperz)
O <- sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
# Store the results in the data frame
count_distribution[index, ] <- c(lowerz, upperz, E, O)
print(count_distribution)
##   lowerz upperz expected observed
## 1     -3     -2 41.70906       33

# The strategy here is to think about several ranges of z-scores in steps of size 1. We find out how many data points we expect to fall into each range based on area computations, and then how many data points actually do fall into each range in the original data set. We'd like to track these intervals from z = -3 to z = 3. The code above computes the expected number of data points in the z-interval [-3, -2) using area computations, and also the observed number of data points in this interval, then adds this information to our data frame.

# Interval [-2, -1) -> row 2
lowerz <- -2
upperz <- -1
index <- 2
E <- (pnorm(upperz) - pnorm(lowerz)) * my_count
O <- sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
count_distribution[index, ] <- c(lowerz, upperz, E, O)
print(count_distribution)
##   lowerz upperz  expected observed
## 1     -3     -2  41.70906       33
## 2     -2     -1 264.87908      202

# Interval [-1, 0) -> row 3
lowerz <- -1
upperz <- 0
index <- 3
E <- (pnorm(upperz) - pnorm(lowerz)) * my_count
O <- sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
count_distribution[index, ] <- c(lowerz, upperz, E, O)
print(count_distribution)
##   lowerz upperz  expected observed
## 1     -3     -2  41.70906       33
## 2     -2     -1 264.87908      202
## 3     -1      0 665.28091      812

# Interval [0, 1) -> row 4
lowerz <- 0
upperz <- 1
index <- 4
E <- (pnorm(upperz) - pnorm(lowerz)) * my_count
O <- sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
count_distribution[index, ] <- c(lowerz, upperz, E, O)
print(count_distribution)
##   lowerz upperz  expected observed
## 1     -3     -2  41.70906       33
## 2     -2     -1 264.87908      202
## 3     -1      0 665.28091      812
## 4      0      1 665.28091      617

# Interval [1, 2) -> row 5
lowerz <- 1
upperz <- 2
index <- 5
E <- (pnorm(upperz) - pnorm(lowerz)) * my_count
O <- sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
count_distribution[index, ] <- c(lowerz, upperz, E, O)
print(count_distribution)
##   lowerz upperz  expected observed
## 1     -3     -2  41.70906       33
## 2     -2     -1 264.87908      202
## 3     -1      0 665.28091      812
## 4      0      1 665.28091      617
## 5      1      2 264.87908      220

# Interval [2, 3) -> row 6
lowerz <- 2
upperz <- 3
index <- 6
E <- (pnorm(upperz) - pnorm(lowerz)) * my_count
O <- sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
count_distribution[index, ] <- c(lowerz, upperz, E, O)
print(count_distribution)
##   lowerz upperz  expected observed
## 1     -3     -2  41.70906       33
## 2     -2     -1 264.87908      202
## 3     -1      0 665.28091      812
## 4      0      1 665.28091      617
## 5      1      2 264.87908      220
## 6      2      3  41.70906       53

# Reshape to long format: one row per interval and per count type, so the
# expected and observed counts can be drawn as side-by-side bars
count_distribution_long <- count_distribution %>%
  pivot_longer(c(expected, observed), names_to = "Group", values_to = "Counts")
print(count_distribution_long)
## (prints a 12-row tibble: one "expected" and one "observed" row for each of the six intervals)

# Prepping the data for the chi-square distribution

# Side-by-side bars of expected and observed counts, positioned by the lower
# z bound of each interval
ggplot(
  data = count_distribution_long,
  mapping = aes(x = lowerz, y = Counts, fill = Group)
) +
  geom_col(position = "dodge")

# in the Z =[2,3) The count is higher in the observed group when compared to the expected.

# Total number of February observations
my_count <- nrow(feb_sea_weather)

# Bounds of the unit-wide z intervals [-3, -2), [-2, -1), ..., [2, 3)
lowerz <- seq(-3, 2, by = 1)
upperz <- lowerz + 1

# Build the whole table in one vectorized step instead of growing a data frame
# with rbind() inside a loop. The observed count uses the half-open test
# lowerz <= z < upperz directly, which avoids comparing floating-point z-scores
# with `==`, and na.rm = TRUE keeps a missing z-score from turning the count
# into NA.
count_distribution <- data.frame(
  lowerz = lowerz,
  upperz = upperz,
  Expected = (pnorm(upperz) - pnorm(lowerz)) * my_count,
  Observed = vapply(
    seq_along(lowerz),
    function(i) {
      sum(
        feb_sea_weather$TMAX_z >= lowerz[i] & feb_sea_weather$TMAX_z < upperz[i],
        na.rm = TRUE
      )
    },
    integer(1)
  )
)

# Print the result
print(count_distribution)
##   lowerz upperz  Expected Observed
## 1     -3     -2  41.70906       33
## 2     -2     -1 264.87908      202
## 3     -1      0 665.28091      812
## 4      0      1 665.28091      617
## 5      1      2 264.87908      220
## 6      2      3  41.70906       53

#Modified R code so that I can see the Expected and Observed between Z -3 and 3 in 1 unit difference intervals

library(ggplot2)
library(tidyr)
library(dplyr)

# Reuse the count_distribution table computed above (columns lowerz, upperz,
# Expected, Observed) instead of retyping its rounded values by hand, so the
# chart always reflects the data that was actually analysed.

# Convert count_distribution to long format
count_distribution_long <- count_distribution %>%
  pivot_longer(cols = c("Expected", "Observed"),
               names_to = "Type",
               values_to = "Counts")

# Create the bar chart; factor(lowerz) gives each interval its own discrete slot
ggplot(count_distribution_long, aes(x = factor(lowerz), y = Counts, fill = Type)) +
  geom_col(position = "dodge") +
  labs(title = "Observed vs Expected Counts for Z-Score Ranges",
       x = "Z-Score Range (Lower Bound)",
       y = "Count",
       fill = "Type") +
  theme_minimal()

# Observed versus expected counts for each unit-wide z interval between -3 and 3. The expected counts are greater than the observed counts in every interval except [-1, 0), where the observed count is much larger, and [2, 3), where it is slightly larger. This suggests that the normal model does a fairly good job across most of the range of z values. The large excess of observations in the [-1, 0) interval offsets the smaller shortfalls in the other z intervals.

library(ggplot2)
library(tidyr)
library(dplyr)
# Chi-square statistic and degrees of freedom computed from the table built
# above (columns Expected and Observed), replacing the placeholder values
# 5 and 10 that were never filled in.
chi_square_value <- sum(
  (count_distribution$Observed - count_distribution$Expected)^2 /
    count_distribution$Expected
)
df_value <- nrow(count_distribution) - 1  # number of intervals minus one

# Convert count_distribution to long format
count_distribution_long <- count_distribution %>%
  pivot_longer(cols = c("Expected", "Observed"),
               names_to = "Type",
               values_to = "Counts")

# Bar chart of observed vs expected counts
ggplot(count_distribution_long, aes(x = factor(lowerz), y = Counts, fill = Type)) +
  geom_col(position = "dodge") +
  labs(title = "Observed vs Expected Counts for Z-Score Ranges",
       x = "Z-Score Range (Lower Bound)",
       y = "Count",
       fill = "Type") +
  theme_minimal()

# The chi-square density lives on a different scale (x = chi-square value,
# y = density) than the count bars, so it is drawn as its own plot. The shaded
# area to the right of the statistic is the p-value region.
ggplot(data.frame(x = c(0, chi_square_value + 20)), aes(x = x)) +
  stat_function(fun = dchisq, args = list(df = df_value)) +
  geom_area(stat = "function",
            fun = dchisq,
            args = list(df = df_value),
            fill = "blue",
            xlim = c(chi_square_value, chi_square_value + 20)) +
  geom_vline(xintercept = chi_square_value, color = "blue")

#' Expected vs observed counts per z-score interval under a normal model
#'
#' Standardizes one numeric column, splits the z range `[lower_z, upper_z)`
#' into consecutive half-open intervals of width `step_size`, and compares the
#' number of observations in each interval with the number a standard normal
#' distribution predicts.
#'
#' @param df A data frame.
#' @param num_var Unquoted column of `df` (or an expression of its columns)
#'   to standardize.
#' @param lower_z Lower end of the first interval.
#' @param upper_z Upper end of the last interval.
#' @param step_size Width of each interval; must be positive.
#' @return A data frame with one row per interval and columns `lowerz`,
#'   `upperz`, `expected`, `observed` and `residual`, where `residual` is
#'   `(observed - expected)^2 / expected`.
count_distribution_normal <- function(df, num_var, lower_z, upper_z, step_size) {
  stopifnot(
    is.numeric(lower_z), is.numeric(upper_z), is.numeric(step_size),
    length(step_size) == 1, step_size > 0
  )

  # Pull the values out once and standardize them with the column's own mean/sd
  values <- df %>%
    mutate(.z_source = {{ num_var }}) %>%
    pull(.z_source)
  data_z <- (values - mean(values, na.rm = TRUE)) / sd(values, na.rm = TRUE)

  # Number of whole steps that fit in the range. The small tolerance keeps a
  # ratio such as 23.999999 from losing its last interval, and seq_len() yields
  # zero intervals (not c(1, 0)) when the range is empty.
  n_steps <- max(floor((upper_z - lower_z) / step_size + 1e-9), 0)

  # Compute the bounds from the step index rather than by repeated addition,
  # so rounding error does not accumulate across intervals
  lowerz <- lower_z + (seq_len(n_steps) - 1) * step_size
  upperz <- lowerz + step_size

  # Expected counts use the number of non-missing observations
  n_obs <- sum(!is.na(values))
  expected <- (pnorm(upperz) - pnorm(lowerz)) * n_obs

  # Observed counts on the half-open interval [lowerz, upperz); missing
  # z-scores are ignored instead of turning the count into NA
  observed <- vapply(
    seq_len(n_steps),
    function(i) sum(data_z >= lowerz[i] & data_z < upperz[i], na.rm = TRUE),
    integer(1)
  )

  data.frame(
    lowerz = lowerz,
    upperz = upperz,
    expected = expected,
    observed = observed,
    residual = (observed - expected)^2 / expected
  )
}

# Tabulate the February TMAX counts on unit-wide z intervals from -3 to 3
result <- feb_sea_weather %>%
  count_distribution_normal(TMAX, -3, 3, 1)
print(result)
##   lowerz upperz  expected observed  residual
## 1     -3     -2  41.70906       33  1.818494
## 2     -2     -1 264.87908      202 14.926732
## 3     -1      0 665.28091      812 32.356995
## 4      0      1 665.28091      617  3.503853
## 5      1      2 264.87908      220  7.603968
## 6      2      3  41.70906       53  3.056541
# Same table again, kept under the name used by the later plotting steps
feb_dist <- feb_sea_weather %>%
  count_distribution_normal(TMAX, -3, 3, 1)
print(feb_dist)
##   lowerz upperz  expected observed  residual
## 1     -3     -2  41.70906       33  1.818494
## 2     -2     -1 264.87908      202 14.926732
## 3     -1      0 665.28091      812 32.356995
## 4      0      1 665.28091      617  3.503853
## 5      1      2 264.87908      220  7.603968
## 6      2      3  41.70906       53  3.056541

#I have the residuals for each iteration of 1 unit between Z=-3 and Z=3 for the TMAX.

#' Expected vs observed counts per z-score interval under a normal model
#'
#' Standardizes one numeric column, splits the z range `[lower_z, upper_z)`
#' into consecutive half-open intervals of width `step_size`, and compares the
#' number of observations in each interval with the number a standard normal
#' distribution predicts.
#'
#' @param df A data frame.
#' @param num_var Unquoted column of `df` (or an expression of its columns)
#'   to standardize.
#' @param lower_z Lower end of the first interval.
#' @param upper_z Upper end of the last interval.
#' @param step_size Width of each interval; must be positive.
#' @return A data frame with one row per interval and columns `lowerz`,
#'   `upperz`, `expected`, `observed` and `residual`, where `residual` is
#'   `(observed - expected)^2 / expected`.
count_distribution_normal <- function(df, num_var, lower_z, upper_z, step_size) {
  stopifnot(
    is.numeric(lower_z), is.numeric(upper_z), is.numeric(step_size),
    length(step_size) == 1, step_size > 0
  )

  # Pull the values out once and standardize them with the column's own mean/sd
  values <- df %>%
    mutate(.z_source = {{ num_var }}) %>%
    pull(.z_source)
  data_z <- (values - mean(values, na.rm = TRUE)) / sd(values, na.rm = TRUE)

  # Number of whole steps that fit in the range. The small tolerance keeps a
  # ratio such as 23.999999 from losing its last interval, and seq_len() yields
  # zero intervals (not c(1, 0)) when the range is empty.
  n_steps <- max(floor((upper_z - lower_z) / step_size + 1e-9), 0)

  # Compute the bounds from the step index rather than by repeated addition,
  # so rounding error does not accumulate across intervals
  lowerz <- lower_z + (seq_len(n_steps) - 1) * step_size
  upperz <- lowerz + step_size

  # Expected counts use the number of non-missing observations
  n_obs <- sum(!is.na(values))
  expected <- (pnorm(upperz) - pnorm(lowerz)) * n_obs

  # Observed counts on the half-open interval [lowerz, upperz); missing
  # z-scores are ignored instead of turning the count into NA
  observed <- vapply(
    seq_len(n_steps),
    function(i) sum(data_z >= lowerz[i] & data_z < upperz[i], na.rm = TRUE),
    integer(1)
  )

  data.frame(
    lowerz = lowerz,
    upperz = upperz,
    expected = expected,
    observed = observed,
    residual = (observed - expected)^2 / expected
  )
}

# Tabulate the February TMIN counts on unit-wide z intervals and show the table
feb_dist <- feb_sea_weather %>%
  count_distribution_normal(TMIN, -3, 3, 1)
feb_dist %>% print()
##   lowerz upperz  expected observed    residual
## 1     -3     -2  41.70906       41  0.01205398
## 2     -2     -1 264.87908      192 20.05202014
## 3     -1      0 665.28091      641  0.88618595
## 4      0      1 665.28091      726  5.54173106
## 5      1      2 264.87908      322 12.31807040
## 6      2      3  41.70906       13 19.76093376

#I have the residuals for each iteration of 1 unit between Z=-3 and Z=3 for the TMIN

# Draw the expected and observed counts of a distribution table as
# side-by-side bars, one pair per z interval.
#
# dist_table: a table with `lowerz`, `expected` and `observed` columns.
# Returns the ggplot object.
chi_square_bar_chart <- function(dist_table) {
  # One row per (interval, count type) so the fill colour can tell them apart
  long_counts <- pivot_longer(
    dist_table,
    cols = c(expected, observed),
    names_to = "Group",
    values_to = "Counts"
  )

  ggplot(long_counts, aes(x = lowerz, y = Counts, fill = Group)) +
    geom_col(position = "dodge")
}
# Plot the chi-square density for a distribution table, mark the observed
# chi-square statistic, shade the tail to its right and report the p-value in
# the plot title.
#
# dist_table: a table with a `residual` column holding each interval's
#   contribution to the chi-square statistic.
# Returns the ggplot object.
chi_square_graph_pvalue <- function(dist_table) {
  chi_square <- sum(dist_table$residual)
  # NOTE(review): uses (number of intervals - 1) degrees of freedom; when the
  # mean and sd are estimated from the same data, a goodness-of-fit test often
  # subtracts those estimated parameters as well - confirm the intended choice.
  deg_freedom <- nrow(dist_table) - 1
  # Ask for the upper tail directly: 1 - pchisq(...) rounds to exactly 0 once
  # the tail probability drops below machine precision, hiding its true size.
  p_value <- pchisq(chi_square, deg_freedom, lower.tail = FALSE)

  upper_x <- chi_square + 20

  ggplot(data.frame(x = c(0, upper_x)), aes(x = x)) +
    stat_function(fun = dchisq, args = list(df = deg_freedom)) +
    geom_area(
      stat = "function",
      fun = dchisq,
      args = list(df = deg_freedom),
      fill = "blue",
      xlim = c(chi_square, upper_x)
    ) +
    geom_vline(xintercept = chi_square, color = "blue") +
    ggtitle(paste("chi-square =", chi_square, "; p-value =", p_value))
}
# February TMIN, z intervals of width 1
feb_dist <- feb_sea_weather %>% count_distribution_normal(TMIN, -3, 3, 1)
feb_dist %>% chi_square_bar_chart()

feb_dist %>% chi_square_graph_pvalue()

# This is the bar chart and chi-square p-value plot for February TMIN. Note that the p-value is far smaller than alpha = 0.0001.

# February TMAX, z intervals of width 1
feb_dist <- feb_sea_weather %>% count_distribution_normal(TMAX, -3, 3, 1)
feb_dist %>% chi_square_bar_chart()

feb_dist %>% chi_square_graph_pvalue()

# This is the bar chart and chi-square p-value plot for February TMAX. Note that the p-value is far smaller than alpha = 0.0001.

# February TMAX, z intervals of width 0.5
feb_dist <- feb_sea_weather %>% count_distribution_normal(TMAX, -3, 3, 0.5)
feb_dist %>% chi_square_bar_chart()

feb_dist %>% chi_square_graph_pvalue()

# Ran TMAX with step increments of 0.5 between z values.

# February TMIN, z intervals of width 0.5
feb_dist <- feb_sea_weather %>% count_distribution_normal(TMIN, -3, 3, 0.5)
feb_dist %>% chi_square_bar_chart()

feb_dist %>% chi_square_graph_pvalue()

# Ran TMIN with step increments of 0.5 between z values.

# February TMIN, z intervals of width 0.25
feb_dist <- feb_sea_weather %>% count_distribution_normal(TMIN, -3, 3, 0.25)
feb_dist %>% chi_square_bar_chart()

feb_dist %>% chi_square_graph_pvalue()

#What do you notice? Does it appear your month’s low temperatures are normally distributed or significantly different from normally distributed? This bar chart compares the observed counts (in blue) of low temperatures within different z-score intervals against the expected counts (in red) if the temperatures followed a normal distribution. The expected counts represent what you would expect if the data were perfectly normally distributed. The observed counts do not perfectly match the expected counts, particularly at the tails and some central intervals. This visual again suggests that the low temperatures are not perfectly normally distributed, as the observed counts show deviations from what is expected under a normal distribution. The chi-square plot shows the chi-square distribution, with the calculated chi-square statistic for the data marked as a vertical blue line. The p-value is also displayed. The chi-square statistic is very high (274.73), and the p-value is effectively 0 (too small to distinguish from zero), which indicates that the observed distribution of low temperatures is significantly different from the expected normal distribution. A p-value this close to 0 means there is very strong evidence against the null hypothesis, which in this case is that the temperatures are normally distributed. Statistically, the low temperatures for the month are significantly different from a normal distribution.

# February TMAX, z intervals of width 0.25
feb_dist <- feb_sea_weather %>% count_distribution_normal(TMAX, -3, 3, 0.25)
feb_dist %>% chi_square_bar_chart()

feb_dist %>% chi_square_graph_pvalue()

#What do you notice? Does it appear your month’s high temperatures are normally distributed or significantly different from normally distributed? This bar chart compares the observed counts (in blue) of high temperatures within different z-score intervals against the expected counts (in red) if the temperatures followed a normal distribution. The expected counts represent what you would expect if the data were perfectly normally distributed. The observed counts show how the actual data is distributed. There are noticeable discrepancies between the expected and observed counts, especially in the tails of the distribution (e.g., the lower and upper ends of the z-scores) and some differences in the central part of the distribution. This visual suggests that the high temperatures are not perfectly normally distributed, as the observed counts do not consistently match the expected counts across the range of z-scores. The chi-square plot shows the chi-square distribution, with the calculated chi-square statistic for the data marked as a vertical blue line. The p-value is also displayed. The chi-square statistic is very high (273.55), and the p-value is effectively 0 (too small to distinguish from zero), which indicates that the observed distribution of temperatures is significantly different from the expected normal distribution. A p-value this close to 0 means there is very strong evidence against the null hypothesis, which in this case is that the temperatures are normally distributed. Statistically, the high temperatures for the month are significantly different from a normal distribution.

# Fix the random seed so the same 20 February days are drawn on every run
set.seed(0220)
feb_sampled_days <- slice_sample(filter(sea_weather, month == 2), n = 20)

# Show the sampled days
feb_sampled_days %>% print()
##          DATE PRCP TMAX TMIN year month day
## 1  2007-02-19 0.74   46   41 2007     2  19
## 2  1968-02-11 0.00   54   30 1968     2  11
## 3  1985-02-13 0.00   46   30 1985     2  13
## 4  1950-02-09 0.02   45   37 1950     2   9
## 5  1969-02-08 1.05   48   37 1969     2   8
## 6  1971-02-10 0.31   53   46 1971     2  10
## 7  2010-02-12 0.38   54   43 2010     2  12
## 8  1991-02-04 0.68   58   49 1991     2   4
## 9  2007-02-06 0.03   48   41 2007     2   6
## 10 2016-02-02 0.01   50   35 2016     2   2
## 11 1964-02-17 0.12   46   42 1964     2  17
## 12 1972-02-09 0.00   44   33 1972     2   9
## 13 2012-02-05 0.00   57   35 2012     2   5
## 14 1958-02-17 0.40   57   44 1958     2  17
## 15 1949-02-14 0.18   38   30 1949     2  14
## 16 1978-02-08 0.09   57   43 1978     2   8
## 17 1979-02-25 0.43   58   45 1979     2  25
## 18 1994-02-15 0.65   48   44 1994     2  15
## 19 1993-02-02 0.00   57   38 1993     2   2
## 20 1996-02-13 0.00   59   37 1996     2  13

#Sample of 20 days from February (My Birthday Month)

# The three runs below use the step sizes named in their headings
# (0.25, 0.50 and 1) rather than 0.25 for all of them.

# Sample of 20 from February with Z increments of 0.25-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMAX, -3, 3, 0.25)
chi_square_bar_chart(feb_dist)

chi_square_graph_pvalue(feb_dist)

# Sample of 20 from February with Z increments of 0.50-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMAX, -3, 3, 0.5)
chi_square_bar_chart(feb_dist)

chi_square_graph_pvalue(feb_dist)

# Sample of 20 from February with Z increments of 1-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMAX, -3, 3, 1)
chi_square_bar_chart(feb_dist)

chi_square_graph_pvalue(feb_dist)

#This chart compares the observed counts of high temperatures (in blue) within different z-score intervals against the expected counts (in red) if the temperatures followed a normal distribution. The observed counts show some variation compared to the expected counts. However, the discrepancies are not as pronounced as in the earlier charts. The observed counts generally follow the expected distribution, with some fluctuations. The visual suggests that the high temperatures are fairly close to a normal distribution, though there are some small deviations. The observed counts are not significantly different from the expected counts. This plot shows the chi-square distribution, with the calculated chi-square statistic for your data marked as a vertical blue line. The p-value is also displayed. The chi-square statistic is 22.77, which is relatively low, and the p-value is 0.474. This p-value is high enough to indicate that the differences between the observed and expected distributions are not statistically significant. A p-value above 0.05 generally suggests that you do not have enough evidence to reject the null hypothesis that the data is normally distributed. Statistically, the high temperatures for your birthday month do not appear to be significantly different from a normal distribution. The p-value indicates that the observed distribution could be consistent with a normal distribution. One caveat: with only 20 sampled days spread over many intervals, the expected count in each interval is very small, so the chi-square approximation is rough and the test has little power to detect departures from normality.

# The three runs below use the step sizes named in their headings
# (0.25, 0.50 and 1) rather than 0.25 for all of them.

# Sample of 20 from February with Z increments of 0.25-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMIN, -3, 3, 0.25)
chi_square_bar_chart(feb_dist)

chi_square_graph_pvalue(feb_dist)

# Sample of 20 from February with Z increments of 0.50-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMIN, -3, 3, 0.5)
chi_square_bar_chart(feb_dist)

chi_square_graph_pvalue(feb_dist)

# Sample of 20 from February with Z increments of 1-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMIN, -3, 3, 1)
chi_square_bar_chart(feb_dist)

chi_square_graph_pvalue(feb_dist)

#This chart compares the observed counts of low temperatures (in blue) within different z-score intervals against the expected counts (in red) if the temperatures followed a normal distribution. The observed counts show some variation compared to the expected counts. However, the discrepancies are not as pronounced as in the earlier charts. The observed counts generally follow the expected distribution, with some fluctuations. The visual suggests that the low temperatures are fairly close to a normal distribution, though there are some small deviations. The observed counts are not significantly different from the expected counts. This plot shows the chi-square distribution, with the calculated chi-square statistic for your data marked as a vertical blue line. The p-value is also displayed. The chi-square statistic is 22.77, which is relatively low, and the p-value is 0.474. This p-value is high enough to indicate that the differences between the observed and expected distributions are not statistically significant. A p-value above 0.05 generally suggests that you do not have enough evidence to reject the null hypothesis that the data is normally distributed. Statistically, the low temperatures for your birthday month do not appear to be significantly different from a normal distribution. The p-value indicates that the observed distribution could be consistent with a normal distribution. One caveat: with only 20 sampled days spread over many intervals, the expected count in each interval is very small, so the chi-square approximation is rough and the test has little power to detect departures from normality.

library(tidytuesdayR)
library(readr)

# Download the TidyTuesday RAM chip data (column-type message suppressed)
ram <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-09-03/ram.csv",
  show_col_types = FALSE
)

# Keep the analysis copy under its own name
ram_df <- ram

# Preview the first six rows
ram_df %>%
  head() %>%
  print()
## # A tibble: 6 × 10
##   chip_name capacity_bits bit_units ram_type       transistor_count
##   <chr>             <dbl> <chr>     <chr>                     <dbl>
## 1 N/A                   1 Bits      SRAM (cell)                   6
## 2 N/A                   1 Bits      DRAM (cell)                   1
## 3 ?                     8 Bits      SRAM (bipolar)               48
## 4 SP95                 16 Bits      SRAM (bipolar)               80
## 5 TMC3162              16 Bits      SRAM (TTL)                   96
## 6 ?                    NA <NA>      SRAM (MOS)                   NA
## # ℹ 5 more variables: date_of_introduction <dbl>, manufacturer_s <chr>,
## #   process <dbl>, area <dbl>, ref <chr>
# Compact column-by-column overview of the RAM data
ram_df %>% glimpse()
## Rows: 47
## Columns: 10
## $ chip_name            <chr> "N/A", "N/A", "?", "SP95", "TMC3162", "?", "?", "…
## $ capacity_bits        <dbl> 1, 1, 8, 16, 16, NA, 256, 64, 144, 256, 1, 1, 1, …
## $ bit_units            <chr> "Bits", "Bits", "Bits", "Bits", "Bits", NA, "Bits…
## $ ram_type             <chr> "SRAM (cell)", "DRAM (cell)", "SRAM (bipolar)", "…
## $ transistor_count     <dbl> 6, 1, 48, 80, 96, NA, 256, 384, 864, 1536, 768, 3…
## $ date_of_introduction <dbl> 1963, 1965, 1965, 1965, 1966, 1966, 1968, 1968, 1…
## $ manufacturer_s       <chr> "Fairchild", "Toshiba", "SDS, Signetics", "IBM", …
## $ process              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 12000, NA, 80…
## $ area                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, N…
## $ ref                  <chr> "[162]", "[163][164]", "[162]", "[165]", "[160]",…
library(GGally)
# Names of the numerical variables to compare pairwise
num_vars <- c(
  "capacity_bits",
  "transistor_count",
  "date_of_introduction",
  "area"
)

# Keep only those columns and draw the scatterplot matrix
ggpairs(select(ram_df, all_of(num_vars)))
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 38 rows containing missing values
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 38 rows containing missing values
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 38 rows containing missing values
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_density()`).

# Add the natural log of capacity_bits as a new column named "log(capacity_bits)"
ram_df <- mutate(ram_df, `log(capacity_bits)` = log(capacity_bits))

#One variable (capacity_bits) seems to have a non-linear relationship with the other variables, so it may help to re-express it. I used the mutate() function to create a new column in my data frame that stores the logarithm of that numerical variable. Having re-expressed the variable, I can use the re-expressed version in later models.

# Scatterplot of capacity_bits against area with a fitted straight line.
ggplot(data = ram_df, mapping = aes(x = area, y = capacity_bits)) +
  geom_point() +
  stat_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Simple linear regression of capacity_bits on area, followed by its summary.
model <- lm(formula = capacity_bits ~ area, data = ram_df)
summary(model)
## 
## Call:
## lm(formula = capacity_bits ~ area, data = ram_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -113.40 -104.08  -46.06  148.54  177.94 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  117.285     60.775   1.930   0.0949 .
## area          -0.289      1.094  -0.264   0.7992  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 130.9 on 7 degrees of freedom
##   (38 observations deleted due to missingness)
## Multiple R-squared:  0.009873,   Adjusted R-squared:  -0.1316 
## F-statistic: 0.0698 on 1 and 7 DF,  p-value: 0.7992

#Scatterplot for the relationship between the area of the chip and its bit capacity. The slope of the line of best fit represents the change in capacity_bits for every one-unit (1 square millimeter) increase in area.

# Extract the slope (coefficient of area)
slope <- coef(model)[2]
cat("Slope:", slope, "\n")
## Slope: -0.2889913
# Calculate the correlation coefficient.
# area and capacity_bits contain missing values, so cor() returns NA by
# default; use = "complete.obs" restricts the calculation to rows where both
# variables are present.
# NOTE: the NA recorded below was produced before this fix; re-knit to refresh.
correlation <- cor(ram_df$area, ram_df$capacity_bits, use = "complete.obs")
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: NA
# Plot the data with the regression line
ggplot(ram_df, aes(x = area, y = capacity_bits)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatterplot with Regression Line",
       x = "Area of Chip in Square Millimeters",
       y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Predicted values (y_hat) from the model; rows with a missing area get NA
ram_df$y_hat <- predict(model, newdata = ram_df)

# View the first few predicted values
head(ram_df$y_hat)
##  1  2  3  4  5  6 
## NA NA NA NA NA NA

#The slope is -0.2889913. This means that for every 1 square millimeter increase in chip area, the predicted capacity drops by 0.2889913 capacity units. I will filter out the chip with an area of 25 square millimeters as an outlier.

# Filter out rows where the area is 25 square millimeters.
# filter() also drops rows whose area is NA, because NA != 25 evaluates to NA.
ram_df_filtered <- ram_df %>%
  filter(area != 25)

# Refit on the filtered data. Reading the slope from the earlier `model`
# would only repeat the slope of the unfiltered fit.
# NOTE: the slope and y_hat values recorded below were produced with the
# unfiltered model; re-knit to refresh them.
model_filtered <- lm(capacity_bits ~ area, ram_df_filtered)

# Extract the slope (coefficient of area)
new_slope <- coef(model_filtered)[2]
cat("Slope:", new_slope, "\n")
## Slope: -0.2889913
# Calculate the correlation coefficient
correlation <- cor(ram_df_filtered$area, ram_df_filtered$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: -0.04952564
# Plot the data with the regression line
ggplot(ram_df_filtered, aes(x = area, y = capacity_bits)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatterplot with Regression Line-Outlier Removed",
       x = "Area of Chip in Square Millimeters",
       y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'

# Predicted values (y_hat) from the model fitted on the filtered data
ram_df_filtered$y_hat <- predict(model_filtered, newdata = ram_df_filtered)

# View the first few predicted values
head(ram_df_filtered$y_hat)
##        1        2        3        4        5        6 
## 114.3955 113.8175 112.0836 107.1707 105.4368 107.4597

#The outlier at 25 square millimeters essentially lowered the slope by half and was therefore as influential as I thought. However, with the outlier removed I got a correlation coefficient of -0.0495, indicating almost no correlation between the physical area of the RAM chip and the bit capacity.

# Draw a random sample of 20 rows from ram_df.
ram_sample_1 <- slice_sample(ram_df, n = 20)

# Scatterplot of the sample with a fitted straight line.
ggplot(data = ram_sample_1, mapping = aes(x = area, y = capacity_bits)) +
  geom_point() +
  stat_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Regress capacity_bits on area within the sample and summarise the fit.
model <- lm(formula = capacity_bits ~ area, data = ram_sample_1)
summary(model)
## 
## Call:
## lm(formula = capacity_bits ~ area, data = ram_sample_1)
## 
## Residuals:
##       5       7       8 
##  -4.416 -97.156 101.572 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -74.318    221.910  -0.335    0.794
## area           6.728      7.649   0.880    0.541
## 
## Residual standard error: 140.6 on 1 degrees of freedom
##   (17 observations deleted due to missingness)
## Multiple R-squared:  0.4362, Adjusted R-squared:  -0.1276 
## F-statistic: 0.7736 on 1 and 1 DF,  p-value: 0.5407
view(ram_sample_1)

#Slope and correlation for the sample of 20

# Drop the 25 square millimeter chip. filter() also drops rows whose area is
# NA, because NA != 25 evaluates to NA.
ram_sample_1 <- ram_sample_1 %>%
  filter(area != 25)

# Refit on the filtered sample so the slope and predictions below describe
# the rows that were kept.
# NOTE: the outputs recorded below come from the original run; re-knit to
# refresh them.
model <- lm(capacity_bits ~ area, ram_sample_1)

# Extract the slope (coefficient of area)
slope_new2 <- coef(model)[2]
cat("Slope:", slope_new2, "\n")
## Slope: 6.727811
# Calculate the correlation coefficient
correlation <- cor(ram_sample_1$area, ram_sample_1$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: 0.660443
# Plot the sample (not ram_df_filtered) with the regression line
ggplot(ram_sample_1, aes(x = area, y = capacity_bits)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatterplot with Regression Line-Outlier Removed",
       x = "Area of Chip in Square Millimeters",
       y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'

# Predicted values (y_hat) from the model
ram_sample_1$y_hat <- predict(model, newdata = ram_sample_1)

# View the first few predicted values
head(ram_sample_1$y_hat)
##          1          2          3 
##   6.416174 161.155819 154.428008
# Draw a random sample of 10 rows from ram_df.
ram_sample_2 <- ram_df %>% slice_sample(n = 10)

# Scatterplot of the sample with a fitted straight line.
ggplot(ram_sample_2, aes(x = area, y = capacity_bits)) +
  geom_point() +
  stat_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

# Fit the model on this sample (ram_sample_2), not on ram_sample_1.
# NOTE: the summary recorded below was produced with data = ram_sample_1;
# re-knit to refresh it.
model <- lm(capacity_bits ~ area, ram_sample_2)
summary(model)
## 
## Call:
## lm(formula = capacity_bits ~ area, data = ram_sample_1)
## 
## Residuals:
##       1       2       3 
##  -4.416 -97.156 101.572 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -74.318    221.910  -0.335    0.794
## area           6.728      7.649   0.880    0.541
## 
## Residual standard error: 140.6 on 1 degrees of freedom
## Multiple R-squared:  0.4362, Adjusted R-squared:  -0.1276 
## F-statistic: 0.7736 on 1 and 1 DF,  p-value: 0.5407
view(ram_sample_2)

#The second sample, n = 10

# Drop the 25 square millimeter chip. filter() also drops rows whose area is
# NA, because NA != 25 evaluates to NA.
ram_sample_2 <- ram_sample_2 %>%
  filter(area != 25)

# Refit on the filtered sample so the slope and predictions below come from
# ram_sample_2 itself.
# NOTE: the slope and y_hat values recorded below were produced with a model
# fitted on ram_sample_1; re-knit to refresh them.
model <- lm(capacity_bits ~ area, ram_sample_2)

# Extract the slope (coefficient of area)
slope_new3 <- coef(model)[2]
cat("Slope:", slope_new3, "\n")
## Slope: 6.727811
# Calculate the correlation coefficient
correlation <- cor(ram_sample_2$area, ram_sample_2$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: 1
# Plot the sample (not ram_df_filtered) with the regression line
ggplot(ram_sample_2, aes(x = area, y = capacity_bits)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatterplot with Regression Line-Outlier Removed",
       x = "Area of Chip in Square Millimeters",
       y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'

# Predicted values (y_hat) from the model
ram_sample_2$y_hat <- predict(model, newdata = ram_sample_2)

# View the first few predicted values
head(ram_sample_2$y_hat)
##          1          2 
##   6.416174 161.155819

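#When n = 10 the slope was less reliable. Only 2 of the 10 sampled rows had both an area and a capacity value, so the Pearson r of 1 reported above is simply the line through two points and is not informative.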

# Request a random sample of 50 rows from ram_df.
ram_sample_3 <- slice_sample(ram_df, n = 50)

# Scatterplot of the sample with a fitted straight line.
ggplot(data = ram_sample_3, mapping = aes(x = area, y = capacity_bits)) +
  geom_point() +
  stat_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Regress capacity_bits on area within the sample and summarise the fit.
model <- lm(formula = capacity_bits ~ area, data = ram_sample_3)
summary(model)
## 
## Call:
## lm(formula = capacity_bits ~ area, data = ram_sample_3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -113.40 -104.08  -46.06  148.54  177.94 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  117.285     60.775   1.930   0.0949 .
## area          -0.289      1.094  -0.264   0.7992  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 130.9 on 7 degrees of freedom
##   (38 observations deleted due to missingness)
## Multiple R-squared:  0.009873,   Adjusted R-squared:  -0.1316 
## F-statistic: 0.0698 on 1 and 7 DF,  p-value: 0.7992
view(ram_sample_3)

# Drop the 25 square millimeter chip. filter() also drops rows whose area is
# NA, because NA != 25 evaluates to NA.
ram_sample_3 <- ram_sample_3 %>%
  filter(area != 25)

# Refit on the filtered sample so the slope and predictions below describe
# the rows that were kept.
# NOTE: the slope and y_hat values recorded below were produced with the
# model fitted before filtering; re-knit to refresh them.
model <- lm(capacity_bits ~ area, ram_sample_3)

# Extract the slope (coefficient of area)
slope_new4 <- coef(model)[2]
cat("Slope:", slope_new4, "\n")
## Slope: -0.2889913
# Calculate the correlation coefficient
correlation <- cor(ram_sample_3$area, ram_sample_3$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: -0.04952564
# Plot the data with the regression line
ggplot(ram_sample_3, aes(x = area, y = capacity_bits)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatterplot with Regression Line-Outlier Removed",
       x = "Area of Chip in Square Millimeters",
       y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'

# Predicted values (y_hat) from the model
ram_sample_3$y_hat <- predict(model, newdata = ram_sample_3)

# View the first few predicted values
head(ram_sample_3$y_hat)
##         1         2         3         4         5         6 
## 105.43680 113.81754 112.08360  74.51473 107.45974 114.39553

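#The slope (-0.289) and correlation (-0.0495) recorded for n = 50 are the same as for the full data set, not the n = 20 sample (6.73 and 0.66). ram_df has only 47 rows, so asking slice_sample() for 50 rows returns every row.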

# Repeatedly sample rows of `ram_df` and regress `num_var2` on `num_var1`.
#
# Arguments:
#   ram_df    data frame holding the two columns.
#   num_var1  unquoted column used as the explanatory variable (x).
#   num_var2  unquoted column used as the response variable (y).
#   N         number of rows drawn for each sample.
#   T         number of samples to draw. The name shadows the built-in `T`
#             shorthand for TRUE inside this function; it is kept so that
#             existing calls keep working.
#
# Returns a data frame with one row per sample and the columns
# sample_slope, slope_t, sample_corr and corr_t.
#
# Rows with a missing x or y are removed from each sample before anything is
# computed, and the number of rows that remain (not N) is used in the
# standard-error and t formulas. lm() already ignores incomplete rows, so the
# slope is unaffected, but sd() and cor() would otherwise return NA and the
# t-scores would use the wrong sample size. Entries that cannot be computed
# (fewer than two complete rows for the slope and correlation, fewer than
# three for the t-scores) are left as NA.
rep_regression_samples <- function(ram_df, num_var1, num_var2, N, T) {

  # Two-column working copy with fixed names so the model formula can be y ~ x
  tmp <- ram_df %>% select({{num_var1}}, {{num_var2}})
  colnames(tmp) <- c("x", "y")

  # Preallocate one row per sample; entries stay NA unless filled in below
  rep_samples <- data.frame(matrix(NA_real_, ncol = 4, nrow = T))
  colnames(rep_samples) <- c("sample_slope", "slope_t", "sample_corr", "corr_t")

  # seq_len() yields an empty loop when T is 0 (1:T would iterate over 1, 0)
  for (i in seq_len(T)) {
    draw <- tmp %>% slice_sample(n = N)
    draw <- draw[complete.cases(draw), , drop = FALSE]
    n_obs <- nrow(draw)

    # A slope and a correlation need at least two complete observations
    if (n_obs < 2) {
      next
    }

    model <- lm(y ~ x, draw)
    sample_slope <- unname(model$coefficients[2])
    sample_corr <- cor(draw$x, draw$y)

    # The t-scores divide by (n_obs - 2), so they need three or more rows
    slope_t <- NA_real_
    corr_t <- NA_real_
    if (n_obs > 2) {
      slope_SE <- sqrt(sum(model$residuals^2) / (n_obs - 2)) *
        (1 / (sd(draw$x) * sqrt(n_obs - 1)))
      slope_t <- sample_slope / slope_SE
      corr_t <- sample_corr * sqrt((n_obs - 2) / (1 - sample_corr^2))
    }

    rep_samples[i, ] <- c(sample_slope, slope_t, sample_corr, corr_t)
  }

  # The data frame of per-sample results is the function's value
  rep_samples
}
# Draw 100 samples of 20 rows each, regressing capacity_bits on area.
my_rep_samples <- rep_regression_samples(ram_df, area, capacity_bits, N = 20, T = 100)
print(head(my_rep_samples))
##   sample_slope slope_t sample_corr corr_t
## 1   -0.4461785      NA          NA     NA
## 2   -0.1394102      NA          NA     NA
## 3   11.0056391      NA          NA     NA
## 4   11.1627907      NA          NA     NA
## 5   -0.2619231      NA          NA     NA
## 6   -0.2594152      NA          NA     NA
# Histogram of the sample_slope column
library(ggplot2)

ggplot(data = my_rep_samples, mapping = aes(x = sample_slope)) +
  geom_histogram(binwidth = 0.05, fill = "blue", color = "black") +
  labs(
    title = "Histogram of Sample Slopes",
    x = "Sample Slope",
    y = "Frequency"
  )
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Choose an alpha level
alpha <- 0.05

# Degrees of freedom for the slope t-test are (observations per sample - 2).
# The sample size is the 20 rows drawn per sample in the
# rep_regression_samples() call, not the number of rows of my_rep_samples
# (that is the number of repeated samples).
n <- 20
deg_freedom <- n - 2

# Find the critical t value
# NOTE: the values recorded below were produced with n = 100 and without
# na.rm = TRUE; re-knit to refresh them.
critical_t <- qt(alpha / 2, deg_freedom, lower.tail = FALSE)
critical_t
## [1] 1.984467
# Calculate the proportion of samples where the absolute t-value exceeds the
# critical t-value; samples whose t-value is NA are left out of the proportion
reject_null_proportion <- mean(abs(my_rep_samples$slope_t) > critical_t, na.rm = TRUE)

# Print the result
cat("Proportion of samples rejecting the null hypothesis:", reject_null_proportion)
## Proportion of samples rejecting the null hypothesis: NA

#I noticed that the slopes were densest around 0, but there was a wide spread of outliers, indicating that the sample slopes are not normally distributed. At alpha = 0.05 I was unable to say what proportion of samples rejected the null hypothesis (the result was NA). When increasing alpha to 0.10 the conclusion was essentially the same.

# Plot a histogram of the sample_slope column
library(ggplot2)

ggplot(my_rep_samples, aes(x = sample_slope)) +
  geom_histogram(binwidth = 0.05, fill = "blue", color = "black") +
  labs(title = "Histogram of Sample Slopes",
       x = "Sample Slope",
       y = "Frequency")
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Choose an alpha level
alpha <- 0.10

# Degrees of freedom for the slope t-test are (observations per sample - 2).
# The sample size is the 20 rows drawn per sample in the
# rep_regression_samples() call, not the number of rows of my_rep_samples
# (that is the number of repeated samples).
n <- 20
deg_freedom <- n - 2

# Find the critical t value
# NOTE: the values recorded below were produced with n = 100 and without
# na.rm = TRUE; re-knit to refresh them.
critical_t <- qt(alpha / 2, deg_freedom, lower.tail = FALSE)
critical_t
## [1] 1.660551
# Calculate the proportion of samples where the absolute t-value exceeds the
# critical t-value; samples whose t-value is NA are left out of the proportion
reject_null_proportion <- mean(abs(my_rep_samples$slope_t) > critical_t, na.rm = TRUE)

# Print the result
cat("Proportion of samples rejecting the null hypothesis:", reject_null_proportion)
## Proportion of samples rejecting the null hypothesis: NA
# List the column names of ram_df
colnames(ram_df)
##  [1] "chip_name"            "capacity_bits"        "bit_units"           
##  [4] "ram_type"             "transistor_count"     "date_of_introduction"
##  [7] "manufacturer_s"       "process"              "area"                
## [10] "ref"                  "log(capacity_bits)"   "y_hat"
# Show the structure of ram_df (column types and leading values)
str(object = ram_df)
## tibble [47 × 12] (S3: tbl_df/tbl/data.frame)
##  $ chip_name           : chr [1:47] "N/A" "N/A" "?" "SP95" ...
##  $ capacity_bits       : num [1:47] 1 1 8 16 16 NA 256 64 144 256 ...
##  $ bit_units           : chr [1:47] "Bits" "Bits" "Bits" "Bits" ...
##  $ ram_type            : chr [1:47] "SRAM (cell)" "DRAM (cell)" "SRAM (bipolar)" "SRAM (bipolar)" ...
##  $ transistor_count    : num [1:47] 6 1 48 80 96 ...
##  $ date_of_introduction: num [1:47] 1963 1965 1965 1965 1966 ...
##  $ manufacturer_s      : chr [1:47] "Fairchild" "Toshiba" "SDS, Signetics" "IBM" ...
##  $ process             : num [1:47] NA NA NA NA NA NA NA NA NA 12000 ...
##  $ area                : num [1:47] NA NA NA NA NA NA NA NA NA NA ...
##  $ ref                 : chr [1:47] "[162]" "[163][164]" "[162]" "[165]" ...
##  $ log(capacity_bits)  : num [1:47] 0 0 2.08 2.77 2.77 ...
##  $ y_hat               : Named num [1:47] NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:47] "1" "2" "3" "4" ...
# Indicator column for Hitachi: 1 when manufacturer_s is exactly "Hitachi",
# 0 for any other manufacturer (a missing manufacturer stays NA)
ram_df$Hitachi <- as.numeric(ram_df$manufacturer_s == "Hitachi")

# Regress capacity_bits on the indicator and summarise the fit
model <- lm(formula = capacity_bits ~ Hitachi, data = ram_df)
summary(model)
## 
## Call:
## lm(formula = capacity_bits ~ Hitachi, data = ram_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.727 -58.727 -46.727   1.273 225.273 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    62.73      13.87   4.524 4.56e-05 ***
## Hitachi       -58.23      66.50  -0.876    0.386    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 91.97 on 44 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01713,    Adjusted R-squared:  -0.005211 
## F-statistic: 0.7667 on 1 and 44 DF,  p-value: 0.386

#Looking at Hitachi, modeled against capacity_bits. In the coefficients section of the summary, you are given one row for (Intercept) and another row for one of your categorical values, whichever one is second in alphabetical order; in my case that row is the Hitachi indicator. The equation is yhat = 62.73 - 58.23x, so for a Hitachi chip (x = 1) yhat = 62.73 - 58.23 = 4.50. The coefficient -58.23 is a measurement of how yhat changes when the indicated condition becomes true compared to the other condition.

# Keep only the rows whose manufacturer_s is exactly Hitachi, IBM or Toshiba
subset_df <- filter(ram_df, manufacturer_s %in% c("Hitachi", "IBM", "Toshiba"))

# Show the first few rows of the subset
head(subset_df)
## # A tibble: 6 × 13
##   chip_name capacity_bits bit_units ram_type       transistor_count
##   <chr>             <dbl> <chr>     <chr>                     <dbl>
## 1 N/A                   1 Bits      DRAM (cell)                   1
## 2 SP95                 16 Bits      SRAM (bipolar)               80
## 3 ?                     8 kb        DRAM (PMOS)                8192
## 4 ?                     4 kb        SRAM (CMOS)               24576
## 5 ?                   288 kb        DRAM                     294912
## 6 ?                   256 kb        SRAM (CMOS)             1572864
## # ℹ 8 more variables: date_of_introduction <dbl>, manufacturer_s <chr>,
## #   process <dbl>, area <dbl>, ref <chr>, `log(capacity_bits)` <dbl>,
## #   y_hat <dbl>, Hitachi <dbl>

#Correlating Hitachi versus Toshiba and IBM

# Fit the linear model on the three-manufacturer subset (subset_df) rather
# than on all of ram_df, so the coefficients compare IBM and Toshiba against
# the Hitachi baseline.
# NOTE: the summary recorded below was produced with data = ram_df (every
# manufacturer); re-knit to refresh it.
model <- lm(capacity_bits ~ manufacturer_s, data = subset_df)

# Print a summary of the linear model
summary(model)
## 
## Call:
## lm(formula = capacity_bits ~ manufacturer_s, data = ram_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -106.000  -48.000    0.000    1.125  207.000 
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                              107.00      60.90
## manufacturer_sFujitsu, NEC                               -91.00     121.79
## manufacturer_sGeneral Instrument                        -105.00     121.79
## manufacturer_sHitachi                                   -102.50      96.28
## manufacturer_sHitachi, NEC                               149.00     121.79
## manufacturer_sHitachi, Toshiba                           -91.00     121.79
## manufacturer_sHyundai                                   -104.50      96.28
## manufacturer_sIBM                                         -3.00      86.12
## manufacturer_sIntel                                      -58.00      72.78
## manufacturer_sIntel, Honeywell                          -106.00     121.79
## manufacturer_sMatsushita                                 -43.00     121.79
## manufacturer_sMatsushita, Mitsubishi, Fujitsu, Toshiba   -43.00     121.79
## manufacturer_sMitsubishi                                -106.00     121.79
## manufacturer_sNEC                                        -25.80      77.03
## manufacturer_sNEC, Toshiba, Hitachi, Mitsubishi         -103.00     121.79
## manufacturer_sNTT                                          5.00      86.12
## manufacturer_sSamsung                                    -63.00      74.58
## manufacturer_sSDS, Signetics                             -99.00     121.79
## manufacturer_sSiemens                                    -43.00     121.79
## manufacturer_sToshiba                                    -20.00      86.12
## manufacturer_sTransitron                                 -91.00     121.79
##                                                        t value Pr(>|t|)  
## (Intercept)                                              1.757   0.0911 .
## manufacturer_sFujitsu, NEC                              -0.747   0.4619  
## manufacturer_sGeneral Instrument                        -0.862   0.3968  
## manufacturer_sHitachi                                   -1.065   0.2973  
## manufacturer_sHitachi, NEC                               1.223   0.2326  
## manufacturer_sHitachi, Toshiba                          -0.747   0.4619  
## manufacturer_sHyundai                                   -1.085   0.2881  
## manufacturer_sIBM                                       -0.035   0.9725  
## manufacturer_sIntel                                     -0.797   0.4330  
## manufacturer_sIntel, Honeywell                          -0.870   0.3924  
## manufacturer_sMatsushita                                -0.353   0.7270  
## manufacturer_sMatsushita, Mitsubishi, Fujitsu, Toshiba  -0.353   0.7270  
## manufacturer_sMitsubishi                                -0.870   0.3924  
## manufacturer_sNEC                                       -0.335   0.7405  
## manufacturer_sNEC, Toshiba, Hitachi, Mitsubishi         -0.846   0.4057  
## manufacturer_sNTT                                        0.058   0.9542  
## manufacturer_sSamsung                                   -0.845   0.4063  
## manufacturer_sSDS, Signetics                            -0.813   0.4240  
## manufacturer_sSiemens                                   -0.353   0.7270  
## manufacturer_sToshiba                                   -0.232   0.8182  
## manufacturer_sTransitron                                -0.747   0.4619  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105.5 on 25 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2656, Adjusted R-squared:  -0.322 
## F-statistic: 0.452 on 20 and 25 DF,  p-value: 0.963

#My equation is yhat = 4.50 + 99.5x_1 + 82.5x_2, where x_1 = 1 for IBM and x_2 = 1 for Toshiba, with Hitachi as the baseline. This gives the expected estimate when comparing IBM and Toshiba versus Hitachi.

# Linear model built from three numerical variables of the data set:
# capacity_bits as the response, area and transistor_count as predictors.
model_3_var <- lm(formula = capacity_bits ~ area + transistor_count, data = ram_df)
summary(model_3_var)
## 
## Call:
## lm(formula = capacity_bits ~ area + transistor_count, data = ram_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -134.827  -56.409   -5.762   22.651  182.208 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      -7.146e+01  9.711e+01  -0.736   0.4895  
## area              7.842e+00  3.729e+00   2.103   0.0802 .
## transistor_count -6.373e-05  2.842e-05  -2.242   0.0661 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 104.3 on 6 degrees of freedom
##   (38 observations deleted due to missingness)
## Multiple R-squared:  0.4613, Adjusted R-squared:  0.2817 
## F-statistic: 2.569 on 2 and 6 DF,  p-value: 0.1563

#This gives us the intercept and the slopes for this multiple regression model. The intercept is -71.46. The first slope says that for a 1 unit increase in chip area (mm squared), holding transistor count fixed, the predicted bit capacity increases by 7.842 units. The second slope says that for every one unit increase in transistor count, holding area fixed, the predicted capacity decreases by 0.00006373 units. Since the p-value for each of the explanatory variables (0.0802 for chip area and 0.0661 for transistor count) is greater than alpha = 0.05, we do not have enough evidence to declare that there is a significant impact. Both area and transistor_count show marginal significance individually (p-values slightly above 0.05), indicating a potential but not strong relationship with capacity_bits. The overall model is not statistically significant (p = 0.1563), suggesting that the explanatory variables, when considered together, do not have a strong relationship with capacity_bits.

# Model of area on capacity_bits, date_of_introduction and ram_type.
model_sub <- lm(
  formula = area ~ capacity_bits + date_of_introduction + ram_type,
  data = ram_df
)
summary(model_sub)
## 
## Call:
## lm(formula = area ~ capacity_bits + date_of_introduction + ram_type, 
##     data = ram_df)
## 
## Residuals:
##         12         14         16         21         22         24         25 
##  3.003e-01 -4.554e-01  1.551e-01  3.282e-03 -6.661e-16  3.498e+00 -3.502e+00 
##         27         31 
##  8.743e-16  2.776e-17 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)          -5.486e+03  3.352e+03  -1.637   0.2434  
## capacity_bits        -3.049e-02  3.531e-02  -0.863   0.4789  
## date_of_introduction  2.786e+00  1.696e+00   1.643   0.2421  
## ram_typeDRAM (CMOS)   9.799e+01  1.927e+01   5.084   0.0366 *
## ram_typeDRAM (NMOS)   1.431e+01  4.669e+00   3.065   0.0920 .
## ram_typeDRAM (PMOS)   6.598e+00  1.149e+01   0.574   0.6239  
## ram_typeDRAM (VMOS)  -1.257e+00  7.582e+00  -0.166   0.8836  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.523 on 2 degrees of freedom
##   (38 observations deleted due to missingness)
## Multiple R-squared:  0.9983, Adjusted R-squared:  0.9931 
## F-statistic: 192.1 on 6 and 2 DF,  p-value: 0.005188

#Model with 4 numerical and 2 categorical variables. The final regression equation that comes from the reduction process is area = −5486 + (−0.03049 × capacity_bits) + (2.786 × date_of_introduction) + (97.99 × ram_typeDRAM (CMOS)) + (14.31 × ram_typeDRAM (NMOS)) + (6.598 × ram_typeDRAM (PMOS)) + (−1.257 × ram_typeDRAM (VMOS)). Note that yhat is the response variable “area”. The ram_type indicator for “DRAM (CMOS)” is significantly associated with area (p = 0.0366), suggesting that this type of RAM has a considerable positive impact on the area. The indicator for “DRAM (NMOS)” has a marginally significant effect (p = 0.0920), while the other ram_type categories do not show a significant relationship with area. capacity_bits and date_of_introduction do not have significant relationships with area based on the p-values provided, meaning that changes in these variables do not predict area reliably within this model. This model shows that certain types of RAM (DRAM (CMOS) and, to a lesser extent, DRAM (NMOS)) significantly affect the area, while the impact of capacity_bits and date_of_introduction is less clear. Because the fit uses only 9 complete observations and leaves 2 residual degrees of freedom, these conclusions should be treated with caution.