library(tidyverse)
library(openintro)
library(broom)
library(GGally)
sea_weather<-read.csv("https://raw.githubusercontent.com/trevorpelletier/2020Spring/master/Seattle_weather.csv")
glimpse(sea_weather)
## Rows: 25,202
## Columns: 7
## $ DATE <chr> "1949-01-01", "1949-01-02", "1949-01-03", "1949-01-04", "1949-01…
## $ PRCP <dbl> 0.00, 0.03, 0.00, 0.00, 0.00, 0.03, 0.52, 0.00, 0.00, 0.00, 0.00…
## $ TMAX <int> 35, 40, 40, 42, 41, 49, 46, 30, 32, 30, 37, 36, 41, 36, 46, 43, …
## $ TMIN <int> 26, 22, 17, 20, 28, 37, 26, 18, 13, 12, 17, 19, 19, 25, 28, 26, …
## $ year <int> 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…
# Filter out my birth month
# Then filter for February
feb_sea_weather <- sea_weather %>%
filter(month(DATE) == 2)
# Check the output
glimpse(feb_sea_weather)
## Rows: 1,949
## Columns: 7
## $ DATE <chr> "1949-02-01", "1949-02-02", "1949-02-03", "1949-02-04", "1949-02…
## $ PRCP <dbl> 0.00, 0.10, 0.03, 0.11, 0.06, 0.06, 0.00, 0.00, 0.43, 0.96, 0.02…
## $ TMAX <int> 37, 33, 35, 33, 35, 37, 38, 41, 43, 43, 37, 32, 34, 38, 42, 48, …
## $ TMIN <int> 23, 25, 27, 24, 21, 31, 31, 33, 34, 30, 27, 21, 18, 30, 33, 41, …
## $ year <int> 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949…
## $ month <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…
#Store February means and standard deviations
feb_high_mean <- mean(feb_sea_weather$TMAX,na.rm=TRUE)
feb_high_sd <- sd(feb_sea_weather$TMAX,na.rm=TRUE)
feb_low_mean <- mean(feb_sea_weather$TMIN,na.rm=TRUE)
feb_low_sd <- sd(feb_sea_weather$TMIN,na.rm=TRUE)
feb_high_mean
## [1] 49.05182
feb_high_sd
## [1] 6.208218
ggplot(feb_sea_weather, aes(x = TMAX)) +
  geom_histogram(aes(y = after_stat(density)), bins = 12, boundary = feb_high_mean) +
  stat_function(fun = dnorm, args = list(mean = feb_high_mean, sd = feb_high_sd))
# With bins = 12, the histogram looks close to a normal distribution with
# mean = 49.05182 and sd = 6.208218 for TMAX in February.
# Compute the mean and standard deviation for TMAX in February
feb_high_mean <- mean(feb_sea_weather$TMAX, na.rm = TRUE)
feb_high_sd <- sd(feb_sea_weather$TMAX, na.rm = TRUE)
# Add a new column TMAX_z with the z-scores
feb_sea_weather <- feb_sea_weather %>%
mutate(TMAX_z = (TMAX - feb_high_mean) / feb_high_sd)
# View the first few rows of the updated dataset
head(feb_sea_weather)
## DATE PRCP TMAX TMIN year month day TMAX_z
## 1 1949-02-01 0.00 37 23 1949 2 1 -1.941269
## 2 1949-02-02 0.10 33 25 1949 2 2 -2.585576
## 3 1949-02-03 0.03 35 27 1949 2 3 -2.263423
## 4 1949-02-04 0.11 33 24 1949 2 4 -2.585576
## 5 1949-02-05 0.06 35 21 1949 2 5 -2.263423
## 6 1949-02-06 0.06 37 31 1949 2 6 -1.941269
sum_TMAX_z<-sum(feb_sea_weather$TMAX_z,na.rm=TRUE)
sum_TMAX_z
## [1] 1.950211e-13
# 1.950211e-13 is not exactly 0 because of floating-point rounding, but it is so small that it can be treated as 0.
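# A quick sanity check (sketch): all.equal() confirms the floating-point residue is effectively zero.
isTRUE(all.equal(sum_TMAX_z, 0))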
ggplot(feb_sea_weather, aes(x = TMAX_z)) +
  geom_histogram(aes(y = after_stat(density)), bins = 12, boundary = 0, fill = "blue", alpha = 0.6) +
  stat_function(fun = dnorm, args = list(mean = 0, sd = 1), color = "red", linewidth = 1) +
  labs(title = "Density Histogram of TMAX Z-Scores with Standard Normal Distribution",
       x = "Z-Score", y = "Density") +
  theme_minimal()
# Both the TMAX histogram and the TMAX_z histogram are symmetric bell curves, and
# their shapes are identical. This is because the z-score transformation is linear:
# it only shifts and rescales the axis, so the mean of TMAX corresponds to z = 0 on
# the standard normal curve.
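# A quick numeric check (sketch, not run above): after standardizing, TMAX_z should have mean 0 and sd 1.
c(mean = mean(feb_sea_weather$TMAX_z, na.rm = TRUE),
  sd = sd(feb_sea_weather$TMAX_z, na.rm = TRUE))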
my_count<-nrow(feb_sea_weather)
count_distribution<-data.frame(matrix(nrow=1, ncol = 4))
colnames(count_distribution) <- c("lowerz","upperz","expected","observed")
#Initialize an empty data frame to keep track of the range of z-scores (which serve as our categories) and the expected and observed counts in each category.
lowerz = -3
upperz = -2
index = 1
# Calculate the expected count range
E = (pnorm(upperz)-pnorm(lowerz))*my_count
#calculate the observed count for the range
O = sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
#Store the results in a dataframe
count_distribution[index,] = c(lowerz, upperz, E, O)
print(count_distribution)
## lowerz upperz expected observed
## 1 -3 -2 41.70906 33
#The strategy here is to consider several ranges of z-scores in steps of size 1, find how many data points we expect to fall into each range based on normal-area computations, and then find how many data points actually fall into each range in the original data set. We track these intervals from z = -3 to z = 3. The code above computes the expected number of data points in the z-interval [-3, -2) using pnorm(), along with the observed number of data points in that interval, and then adds this information to the empty data frame.
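# An equivalent, more compact sketch of the same idea using cut() and table()
# (not part of the original workflow): bin the z-scores into [a, b) intervals,
# count the observed values, and use pnorm() areas for the expected counts.
breaks <- seq(-3, 3, by = 1)
observed <- as.vector(table(cut(feb_sea_weather$TMAX_z, breaks, right = FALSE)))
expected <- diff(pnorm(breaks)) * nrow(feb_sea_weather)
data.frame(lowerz = head(breaks, -1), upperz = tail(breaks, -1), expected, observed)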
lowerz = -2
upperz = -1
index = 1
# Calculate the expected count range
E = (pnorm(upperz)-pnorm(lowerz))*my_count
#calculate the observed count for the range
O = sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
#Store the results in a dataframe
count_distribution[index,] = c(lowerz, upperz, E, O)
print(count_distribution)
## lowerz upperz expected observed
## 1 -2 -1 264.8791 202
lowerz = -1
upperz = 0
index = 1
# Calculate the expected count range
E = (pnorm(upperz)-pnorm(lowerz))*my_count
#calculate the observed count for the range
O = sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
#Store the results in a dataframe
count_distribution[index,] = c(lowerz, upperz, E, O)
print(count_distribution)
## lowerz upperz expected observed
## 1 -1 0 665.2809 812
lowerz = 0
upperz =1
index = 1
# Calculate the expected count range
E = (pnorm(upperz)-pnorm(lowerz))*my_count
#calculate the observed count for the range
O = sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
#Store the results in a dataframe
count_distribution[index,] = c(lowerz, upperz, E, O)
print(count_distribution)
## lowerz upperz expected observed
## 1 0 1 665.2809 617
lowerz = 1
upperz = 2
index = 1
# Calculate the expected count range
E = (pnorm(upperz)-pnorm(lowerz))*my_count
#calculate the observed count for the range
O = sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
#Store the results in a dataframe
count_distribution[index,] = c(lowerz, upperz, E, O)
print(count_distribution)
## lowerz upperz expected observed
## 1 1 2 264.8791 220
lowerz = 2
upperz = 3
index = 1
# Calculate the expected count range
E = (pnorm(upperz)-pnorm(lowerz))*my_count
#calculate the observed count for the range
O = sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
#Store the results in a dataframe
count_distribution[index,] = c(lowerz, upperz, E, O)
print(count_distribution)
## lowerz upperz expected observed
## 1 2 3 41.70906 53
count_distribution_long <- count_distribution %>%
pivot_longer(c(expected, observed), names_to = "Group", values_to = "Counts")
print(count_distribution_long)
## # A tibble: 2 × 4
## lowerz upperz Group Counts
## <dbl> <dbl> <chr> <dbl>
## 1 2 3 expected 41.7
## 2 2 3 observed 53
ggplot(count_distribution_long, aes(x = lowerz, y = Counts, fill=Group))+
geom_bar(stat="identity",position="dodge")
# In the z = [2, 3) interval the observed count is higher than the expected count.
# First, compute my_count as the total number of rows in feb_sea_weather
my_count <- nrow(feb_sea_weather)
# Initialize an empty data frame to keep track of the range of z-scores
count_distribution <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(count_distribution) <- c("lowerz", "upperz", "Expected", "Observed")
# Loop over the range of z-scores from -3 to 2 (since upperz = lowerz + 1)
for (lowerz in seq(-3, 2, by = 1)) {
upperz = lowerz + 1
E = (pnorm(upperz) - pnorm(lowerz)) * my_count
O = sum(between(feb_sea_weather$TMAX_z, lowerz, upperz)) - sum(feb_sea_weather$TMAX_z == upperz)
count_distribution <- rbind(count_distribution, data.frame(lowerz, upperz, Expected = E, Observed = O))
}
# Print the result
print(count_distribution)
## lowerz upperz Expected Observed
## 1 -3 -2 41.70906 33
## 2 -2 -1 264.87908 202
## 3 -1 0 665.28091 812
## 4 0 1 665.28091 617
## 5 1 2 264.87908 220
## 6 2 3 41.70906 53
#Modified the R code into a loop so that I can see the expected and observed counts between z = -3 and z = 3 in 1-unit intervals.
library(ggplot2)
library(tidyr)
library(dplyr)
# Ensure the data is in the correct structure
count_distribution <- data.frame(
lowerz = c(-3, -2, -1, 0, 1, 2),
upperz = c(-2, -1, 0, 1, 2, 3),
Expected = c(41.70906, 264.87908, 665.28091, 665.28091, 264.87908, 41.70906),
Observed = c(33, 202, 812, 617, 220, 53)
)
# Convert count_distribution to long format
count_distribution_long <- count_distribution %>%
pivot_longer(cols = c("Expected", "Observed"),
names_to = "Type",
values_to = "Counts")
# Create the bar chart
ggplot(count_distribution_long, aes(x = factor(lowerz), y = Counts, fill = Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Observed vs Expected Counts for Z-Score Ranges",
x = "Z-Score Range (Lower Bound)",
y = "Count",
fill = "Type") +
theme_minimal()
# Observed versus expected counts for each unit z interval between -3 and 3. The
# expected values were greater than the observed values in every z interval except
# [-1, 0), where the observed count was much higher, and [2, 3), where it was
# slightly higher. Overall the normal model does a fairly good job across the range
# of z values; the large surplus of observations in the center offsets the
# shortfalls in the other intervals.
library(ggplot2)
library(tidyr)
library(dplyr)
df_value <- 5 # Replace this with your degrees of freedom
chi_square_value <- 10 # Replace this with your chi-square value
# Ensure the data is in the correct structure
count_distribution <- data.frame(
lowerz = c(-3, -2, -1, 0, 1, 2),
upperz = c(-2, -1, 0, 1, 2, 3),
Expected = c(41.70906, 264.87908, 665.28091, 665.28091, 264.87908, 41.70906),
Observed = c(33, 202, 812, 617, 220, 53)
)
# Convert count_distribution to long format
count_distribution_long <- count_distribution %>%
pivot_longer(cols = c("Expected", "Observed"),
names_to = "Type",
values_to = "Counts")
# Create the bar chart
ggplot(count_distribution_long, aes(x = factor(lowerz), y = Counts, fill = Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Observed vs Expected Counts for Z-Score Ranges",
x = "Z-Score Range (Lower Bound)",
y = "Count",
fill = "Type") + theme_minimal()+
geom_area(stat="function", fun = dchisq, args=c(df=df_value),
fill = "blue",
xlim=c(chi_square_value, 20))
count_distribution_normal <- function(df, num_var, lower_z, upper_z, step_size) {
# Initialize an empty data frame
count_distribution <- data.frame(lowerz = numeric(),
upperz = numeric(),
expected = numeric(),
observed = numeric(),
residual = numeric())
# Calculate summary statistics
summary_stats <- df %>% summarise(df_mean = mean({{num_var}}, na.rm = TRUE),
df_sd = sd({{num_var}}, na.rm = TRUE))
# Add a z-score column to the data frame
df_with_z <- df %>% mutate(data_z = ({{num_var}} - summary_stats$df_mean) / summary_stats$df_sd)
# Calculate the number of steps
K = (upper_z - lower_z) / step_size
# Initialize the first interval
a = lower_z
b = lower_z + step_size
# Loop through each interval and compute the counts
for (i in 1:K) {
norm_area <- pnorm(b) - pnorm(a)
expected_count <- norm_area * nrow(df)
observed_count <- sum(between(df_with_z$data_z, a, b)) - sum(df_with_z$data_z == b)
residual <- (observed_count - expected_count)^2 / expected_count
# Append the results to the data frame
count_distribution <- rbind(count_distribution, data.frame(lowerz = a,
upperz = b,
expected = expected_count,
observed = observed_count,
residual = residual))
# Shift the interval up by the step size
a = a + step_size
b = b + step_size
}
# Return the final data frame
return(count_distribution)
}
# Call the function and print the result
result <- count_distribution_normal(feb_sea_weather, TMAX, -3, 3, 1)
print(result)
## lowerz upperz expected observed residual
## 1 -3 -2 41.70906 33 1.818494
## 2 -2 -1 264.87908 202 14.926732
## 3 -1 0 665.28091 812 32.356995
## 4 0 1 665.28091 617 3.503853
## 5 1 2 264.87908 220 7.603968
## 6 2 3 41.70906 53 3.056541
feb_dist<-count_distribution_normal(feb_sea_weather, TMAX,-3,3,1)
feb_dist
## lowerz upperz expected observed residual
## 1 -3 -2 41.70906 33 1.818494
## 2 -2 -1 264.87908 202 14.926732
## 3 -1 0 665.28091 812 32.356995
## 4 0 1 665.28091 617 3.503853
## 5 1 2 264.87908 220 7.603968
## 6 2 3 41.70906 53 3.056541
#I now have the residuals for each 1-unit interval between z = -3 and z = 3 for TMAX.
# Call the function and print the result
feb_dist <- count_distribution_normal(feb_sea_weather, TMIN, -3, 3, 1)
print(feb_dist)
## lowerz upperz expected observed residual
## 1 -3 -2 41.70906 41 0.01205398
## 2 -2 -1 264.87908 192 20.05202014
## 3 -1 0 665.28091 641 0.88618595
## 4 0 1 665.28091 726 5.54173106
## 5 1 2 264.87908 322 12.31807040
## 6 2 3 41.70906 13 19.76093376
#I now have the residuals for each 1-unit interval between z = -3 and z = 3 for TMIN.
chi_square_bar_chart <- function(dist_table){
count_distribution_long <- dist_table %>%
pivot_longer(c(expected, observed),
names_to = "Group",
values_to = "Counts")
ggplot(count_distribution_long, aes(x = lowerz, y = Counts, fill=Group))+
geom_bar(stat="identity",position="dodge")
}
chi_square_graph_pvalue <- function(dist_table){
chi_square <- sum(dist_table$residual)
deg_freedom <- nrow(dist_table) - 1
p_value <- 1 - pchisq(chi_square, deg_freedom)
ggplot(data.frame(x=c(0, chi_square+20)), aes(x=x)) +
stat_function(fun=dchisq, args = c(df = deg_freedom)) +
geom_area(stat="function",
fun = dchisq,
args=c(df = deg_freedom),
fill = "blue",
xlim=c(chi_square, chi_square+20))+
geom_vline(xintercept = chi_square,color="blue") +
ggtitle(paste("chi-square =", chi_square, "; p-value =", p_value))
}
feb_dist<-count_distribution_normal(feb_sea_weather, TMIN,-3,3,1)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
#This is the chi-square bar chart and p-value for February TMIN. Note that the
# p-value is far smaller than alpha = 0.0001.
feb_dist<-count_distribution_normal(feb_sea_weather, TMAX,-3,3,1)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
#This is the chi-square bar chart and p-value for February TMAX. Note that the
# p-value is far smaller than alpha = 0.0001.
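# A quick numeric check (sketch): the same chi-square statistic and p-value that
# appear in the plot title can be computed directly from the residual column.
chi_sq <- sum(feb_dist$residual)
p_val <- pchisq(chi_sq, df = nrow(feb_dist) - 1, lower.tail = FALSE)
c(chi_square = chi_sq, p_value = p_val)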
feb_dist<-count_distribution_normal(feb_sea_weather, TMAX,-3,3,0.5)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
# Ran TMAX with step increments of 0.5 between z values.
feb_dist<-count_distribution_normal(feb_sea_weather, TMIN,-3,3,0.5)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
# Ran TMIN with step increments of 0.5 between z values.
feb_dist<-count_distribution_normal(feb_sea_weather, TMIN,-3,3,0.25)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
#What do you notice? Does it appear your month's low temperatures are normally
# distributed or significantly different from normally distributed? The bar chart
# compares the observed counts (in blue) of low temperatures within different
# z-score intervals against the expected counts (in red) if the temperatures
# followed a normal distribution. The expected counts represent what you would see
# if the data were perfectly normally distributed. The observed counts do not match
# the expected counts, particularly at the tails and in some central intervals, so
# this visual again suggests that the low temperatures are not perfectly normally
# distributed. The chi-square plot shows the chi-square distribution, with the
# calculated chi-square statistic marked as a vertical blue line and the p-value
# displayed in the title. The chi-square statistic is very high (274.73) and the
# p-value is essentially 0, which indicates that the observed distribution of low
# temperatures is significantly different from the expected normal distribution.
# A p-value this small is very strong evidence against the null hypothesis that the
# temperatures are normally distributed. Statistically, the low temperatures for
# the month are significantly different from a normal distribution.
feb_dist<-count_distribution_normal(feb_sea_weather, TMAX,-3,3,0.25)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
#What do you notice? Does it appear your month's high temperatures are normally
# distributed or significantly different from normally distributed? The bar chart
# compares the observed counts (in blue) of high temperatures within different
# z-score intervals against the expected counts (in red) if the temperatures
# followed a normal distribution. There are noticeable discrepancies between the
# expected and observed counts, especially in the tails of the distribution and in
# parts of the center, so the high temperatures are not perfectly normally
# distributed. The chi-square plot shows the chi-square distribution, with the
# calculated chi-square statistic marked as a vertical blue line and the p-value
# displayed in the title. The chi-square statistic is very high (273.55) and the
# p-value is essentially 0, which is very strong evidence against the null
# hypothesis that the temperatures are normally distributed. Statistically, the
# high temperatures for the month are significantly different from a normal
# distribution.
set.seed(0220)
feb_sampled_days <- sea_weather %>%
filter(month == 2) %>%
slice_sample(n = 20)
# Display the sampled days
print(feb_sampled_days)
## DATE PRCP TMAX TMIN year month day
## 1 2007-02-19 0.74 46 41 2007 2 19
## 2 1968-02-11 0.00 54 30 1968 2 11
## 3 1985-02-13 0.00 46 30 1985 2 13
## 4 1950-02-09 0.02 45 37 1950 2 9
## 5 1969-02-08 1.05 48 37 1969 2 8
## 6 1971-02-10 0.31 53 46 1971 2 10
## 7 2010-02-12 0.38 54 43 2010 2 12
## 8 1991-02-04 0.68 58 49 1991 2 4
## 9 2007-02-06 0.03 48 41 2007 2 6
## 10 2016-02-02 0.01 50 35 2016 2 2
## 11 1964-02-17 0.12 46 42 1964 2 17
## 12 1972-02-09 0.00 44 33 1972 2 9
## 13 2012-02-05 0.00 57 35 2012 2 5
## 14 1958-02-17 0.40 57 44 1958 2 17
## 15 1949-02-14 0.18 38 30 1949 2 14
## 16 1978-02-08 0.09 57 43 1978 2 8
## 17 1979-02-25 0.43 58 45 1979 2 25
## 18 1994-02-15 0.65 48 44 1994 2 15
## 19 1993-02-02 0.00 57 38 1993 2 2
## 20 1996-02-13 0.00 59 37 1996 2 13
#Sample of 20 days from February (My Birthday Month)
# Sample of 20 from February with Z increments of 0.25-Chi Square and Bar Chart
feb_dist<-count_distribution_normal(feb_sampled_days, TMAX,-3,3,0.25)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
# Sample of 20 from February with Z increments of 0.50-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMAX, -3, 3, 0.5)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
# Sample of 20 from February with Z increments of 1-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMAX, -3, 3, 1)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
#This chart compares the observed counts of high temperatures (in blue) within
# different z-score intervals against the expected counts (in red) if the
# temperatures followed a normal distribution. The observed counts show some
# variation compared to the expected counts, but the discrepancies are not as
# pronounced as in the earlier charts; the observed counts generally follow the
# expected distribution with some fluctuations. The visual suggests that the high
# temperatures in this sample are fairly close to a normal distribution. The
# chi-square plot shows the chi-square distribution, with the calculated chi-square
# statistic marked as a vertical blue line and the p-value displayed in the title.
# The chi-square statistic is 22.77, which is relatively low, and the p-value is
# 0.474. A p-value above 0.05 means we do not have enough evidence to reject the
# null hypothesis that the data are normally distributed. Statistically, the high
# temperatures for my birthday month sample do not appear to be significantly
# different from a normal distribution.
# Sample of 20 from February with Z increments of 0.25-Chi Square and Bar Chart
feb_dist<-count_distribution_normal(feb_sampled_days, TMIN,-3,3,0.25)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
# Sample of 20 from February with Z increments of 0.50-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMIN, -3, 3, 0.5)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
# Sample of 20 from February with Z increments of 1-Chi Square and Bar Chart
feb_dist <- count_distribution_normal(feb_sampled_days, TMIN, -3, 3, 1)
chi_square_bar_chart(feb_dist)
chi_square_graph_pvalue(feb_dist)
#This chart compares the observed counts of low temperatures (in blue) within
# different z-score intervals against the expected counts (in red) if the
# temperatures followed a normal distribution. The observed counts show some
# variation compared to the expected counts, but the discrepancies are not as
# pronounced as in the earlier charts. The visual suggests that the low
# temperatures in this sample are fairly close to a normal distribution. The
# chi-square statistic is 22.77, which is relatively low, and the p-value is 0.474;
# a p-value above 0.05 means we do not have enough evidence to reject the null
# hypothesis that the data are normally distributed. Statistically, the low
# temperatures for my birthday month sample do not appear to be significantly
# different from a normal distribution.
library(tidytuesdayR)
library(readr)
# Load the data
ram <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-09-03/ram.csv", show_col_types = FALSE)
# Assign to a data frame
ram_df <- ram
# View the first few rows of the data frame
print(head(ram_df))
## # A tibble: 6 × 10
## chip_name capacity_bits bit_units ram_type transistor_count
## <chr> <dbl> <chr> <chr> <dbl>
## 1 N/A 1 Bits SRAM (cell) 6
## 2 N/A 1 Bits DRAM (cell) 1
## 3 ? 8 Bits SRAM (bipolar) 48
## 4 SP95 16 Bits SRAM (bipolar) 80
## 5 TMC3162 16 Bits SRAM (TTL) 96
## 6 ? NA <NA> SRAM (MOS) NA
## # ℹ 5 more variables: date_of_introduction <dbl>, manufacturer_s <chr>,
## # process <dbl>, area <dbl>, ref <chr>
glimpse(ram_df)
## Rows: 47
## Columns: 10
## $ chip_name <chr> "N/A", "N/A", "?", "SP95", "TMC3162", "?", "?", "…
## $ capacity_bits <dbl> 1, 1, 8, 16, 16, NA, 256, 64, 144, 256, 1, 1, 1, …
## $ bit_units <chr> "Bits", "Bits", "Bits", "Bits", "Bits", NA, "Bits…
## $ ram_type <chr> "SRAM (cell)", "DRAM (cell)", "SRAM (bipolar)", "…
## $ transistor_count <dbl> 6, 1, 48, 80, 96, NA, 256, 384, 864, 1536, 768, 3…
## $ date_of_introduction <dbl> 1963, 1965, 1965, 1965, 1966, 1966, 1968, 1968, 1…
## $ manufacturer_s <chr> "Fairchild", "Toshiba", "SDS, Signetics", "IBM", …
## $ process <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 12000, NA, 80…
## $ area <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, N…
## $ ref <chr> "[162]", "[163][164]", "[162]", "[165]", "[160]",…
library(GGally)
#create a vector of a group of numerical variables
num_vars <- c("capacity_bits", "transistor_count", "date_of_introduction", "area")
#pipe your data frame into select() to keep only the numerical variables of interest
#then pipe those variables into ggpairs() to plot scatterplots.
ram_df%>% select(all_of(num_vars)) %>% ggpairs()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 38 rows containing missing values
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 38 rows containing missing values
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 38 rows containing missing values
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_density()`).
ram_df <- ram_df %>% mutate("log(capacity_bits)" = log(capacity_bits))
#One variable (capacity_bits) has a non-linear relationship with the other variables, so it can be re-expressed. I used mutate() to create a new column in the data frame that stores the logarithm of the variable; the re-expressed variable can then be used in place of the original.
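# A quick look (sketch) at the re-expressed variable; date_of_introduction is
# assumed here as the natural x variable for the growth trend.
ggplot(ram_df, aes(x = date_of_introduction, y = log(capacity_bits))) +
  geom_point() +
  stat_smooth(method = lm)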
ggplot(ram_df, aes(x = area, y=capacity_bits)) +
geom_point() +
stat_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
model <- lm(capacity_bits ~ area, ram_df)
summary(model)
##
## Call:
## lm(formula = capacity_bits ~ area, data = ram_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.40 -104.08 -46.06 148.54 177.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117.285 60.775 1.930 0.0949 .
## area -0.289 1.094 -0.264 0.7992
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 130.9 on 7 degrees of freedom
## (38 observations deleted due to missingness)
## Multiple R-squared: 0.009873, Adjusted R-squared: -0.1316
## F-statistic: 0.0698 on 1 and 7 DF, p-value: 0.7992
#Scatterplot with the line of best fit for capacity_bits against chip area. The slope of this line represents the change in bit capacity for every one square millimeter increase in chip area.
# Extract the slope (coefficient of x_var)
slope <- coef(model)[2]
cat("Slope:", slope, "\n")
## Slope: -0.2889913
# Calculate the correlation coefficient
correlation <- cor(ram_df$area, ram_df$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: NA
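# The NA above comes from missing area values; a complete-cases correlation
# (a sketch, not part of the original output) would be:
cor(ram_df$area, ram_df$capacity_bits, use = "complete.obs")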
# Plot the data with the regression line
ggplot(ram_df, aes(x = area, y = capacity_bits)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatterplot with Regression Line",
x = "Area of Chip in Square Millimeters",
y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Predicted values (y_hat) from the model
ram_df$y_hat <- predict(model, newdata = ram_df)
# View the first few predicted values
head(ram_df$y_hat)
## 1 2 3 4 5 6
## NA NA NA NA NA NA
#The slope is -0.2889913. This means that for every 1 square millimeter increase in chip area, the predicted bit capacity drops by about 0.289 bits. I will filter out the chip with an area of 25 square millimeters as an outlier.
# Filter out rows where the area is 25 square millimeters
ram_df_filtered <- ram_df %>%
filter(area != 25)
# Extract the slope (note: `model` here is still the fit on the unfiltered data, so the slope is unchanged)
new_slope <- coef(model)[2]
cat("Slope:", new_slope, "\n")
## Slope: -0.2889913
# Calculate the correlation coefficient
correlation <- cor(ram_df_filtered$area, ram_df_filtered$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: -0.04952564
# Plot the data with the regression line
ggplot(ram_df_filtered, aes(x = area, y = capacity_bits)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatterplot with Regression Line-Outlier Removed",
x = "Area of Chip in Square Millimeters",
y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'
# Predicted values (y_hat) from the model
ram_df_filtered$y_hat <- predict(model, newdata = ram_df_filtered)
# View the first few predicted values
head(ram_df_filtered$y_hat)
## 1 2 3 4 5 6
## 114.3955 113.8175 112.0836 107.1707 105.4368 107.4597
#The printed slope is unchanged because the model above was not refit on the filtered data. The correlation coefficient of about -0.05 on the filtered data indicates almost no linear relationship between the physical area of the RAM chip and its bit capacity, so the 25 square millimeter point was not as influential as I thought.
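# To see how the slope actually changes without the outlier, the model would need
# to be refit on the filtered data (a sketch, not run above; model_filtered is a hypothetical name):
model_filtered <- lm(capacity_bits ~ area, data = ram_df_filtered)
coef(model_filtered)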
ram_sample_1 <- ram_df %>% slice_sample(n = 20)
ggplot(ram_sample_1, aes(x = area, y=capacity_bits)) +
geom_point() +
stat_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
model <- lm(capacity_bits ~ area, ram_sample_1)
summary(model)
##
## Call:
## lm(formula = capacity_bits ~ area, data = ram_sample_1)
##
## Residuals:
## 5 7 8
## -4.416 -97.156 101.572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -74.318 221.910 -0.335 0.794
## area 6.728 7.649 0.880 0.541
##
## Residual standard error: 140.6 on 1 degrees of freedom
## (17 observations deleted due to missingness)
## Multiple R-squared: 0.4362, Adjusted R-squared: -0.1276
## F-statistic: 0.7736 on 1 and 1 DF, p-value: 0.5407
view(ram_sample_1)
#The slope for the sample of 20 is shown above.
ram_sample_1 <- ram_sample_1 %>%
filter(area != 25)
# Extract the slope (coefficient of x_var)
slope_new2 <- coef(model)[2]
cat("Slope:", slope_new2, "\n")
## Slope: 6.727811
# Calculate the correlation coefficient
correlation <- cor(ram_sample_1$area, ram_sample_1$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: 0.660443
# Plot the data with the regression line
ggplot(ram_sample_1, aes(x = area, y = capacity_bits)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatterplot with Regression Line-Outlier Removed",
x = "Area of Chip in Square Millimeters",
y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'
# Predicted values (y_hat) from the model
ram_sample_1$y_hat <- predict(model, newdata = ram_sample_1)
# View the first few predicted values
head(ram_sample_1$y_hat)
## 1 2 3
## 6.416174 161.155819 154.428008
ram_sample_2 <- ram_df %>% slice_sample(n = 10)
ggplot(ram_sample_2, aes(x = area, y=capacity_bits)) +
geom_point() +
stat_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
model <- lm(capacity_bits ~ area, ram_sample_1)
summary(model)
##
## Call:
## lm(formula = capacity_bits ~ area, data = ram_sample_1)
##
## Residuals:
## 1 2 3
## -4.416 -97.156 101.572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -74.318 221.910 -0.335 0.794
## area 6.728 7.649 0.880 0.541
##
## Residual standard error: 140.6 on 1 degrees of freedom
## Multiple R-squared: 0.4362, Adjusted R-squared: -0.1276
## F-statistic: 0.7736 on 1 and 1 DF, p-value: 0.5407
view(ram_sample_2)
#The second sample, n = 10.
ram_sample_2 <- ram_sample_2 %>%
filter(area != 25)
# Extract the slope (coefficient of x_var)
slope_new3 <- coef(model)[2]
cat("Slope:", slope_new3, "\n")
## Slope: 6.727811
# Calculate the correlation coefficient
correlation <- cor(ram_sample_2$area, ram_sample_2$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: 1
# Plot the data with the regression line
ggplot(ram_sample_2, aes(x = area, y = capacity_bits)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatterplot with Regression Line-Outlier Removed",
x = "Area of Chip in Square Millimeters",
y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'
# Predicted values (y_hat) from the model
ram_sample_2$y_hat <- predict(model, newdata = ram_sample_2)
# View the first few predicted values
head(ram_sample_2$y_hat)
## 1 2
## 6.416174 161.155819
#When n = 10 the slope was even less reliable. Only two of the sampled rows had a non-missing area, so the Pearson r of 1 is not meaningful (any two points are perfectly collinear). Note also that the model above was refit on ram_sample_1 rather than ram_sample_2.
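# A quick check (sketch): count how many complete (area, capacity_bits) pairs are
# left in the n = 10 sample; with only two pairs, any correlation is degenerate.
sum(complete.cases(ram_sample_2[, c("area", "capacity_bits")]))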
ram_sample_3 <- ram_df %>% slice_sample(n = 50)
ggplot(ram_sample_3, aes(x = area, y=capacity_bits)) +
geom_point() +
stat_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).
model <- lm(capacity_bits ~ area, ram_sample_3)
summary(model)
##
## Call:
## lm(formula = capacity_bits ~ area, data = ram_sample_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.40 -104.08 -46.06 148.54 177.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117.285 60.775 1.930 0.0949 .
## area -0.289 1.094 -0.264 0.7992
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 130.9 on 7 degrees of freedom
## (38 observations deleted due to missingness)
## Multiple R-squared: 0.009873, Adjusted R-squared: -0.1316
## F-statistic: 0.0698 on 1 and 7 DF, p-value: 0.7992
view(ram_sample_3)
ram_sample_3 <- ram_sample_3 %>%
filter(area != 25)
# Extract the slope (coefficient of x_var)
slope_new4 <- coef(model)[2]
cat("Slope:", slope_new4, "\n")
## Slope: -0.2889913
# Calculate the correlation coefficient
correlation <- cor(ram_sample_3$area, ram_sample_3$capacity_bits)
cat("Correlation Coefficient:", correlation, "\n")
## Correlation Coefficient: -0.04952564
# Plot the data with the regression line
ggplot(ram_sample_3, aes(x = area, y = capacity_bits)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatterplot with Regression Line-Outlier Removed",
x = "Area of Chip in Square Millimeters",
y = "Capacity Bit Units")
## `geom_smooth()` using formula = 'y ~ x'
# Predicted values (y_hat) from the model
ram_sample_3$y_hat <- predict(model, newdata = ram_sample_3)
# View the first few predicted values
head(ram_sample_3$y_hat)
## 1 2 3 4 5 6
## 105.43680 113.81754 112.08360 74.51473 107.45974 114.39553
#Because slice_sample(n = 50) asked for more rows than the 47 in the data frame, it silently returned the whole data set, so the slope and correlation here match the full-data results rather than the n = 20 sample.
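# A quick check (sketch): requesting 50 rows from a 47-row data frame just returns
# all 47 rows (in random order), which is why these results match the full-data model.
nrow(ram_df %>% slice_sample(n = 50))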
rep_regression_samples <- function(ram_df, num_var1, num_var2, N, T) {
# Initialize an empty data frame and title the columns)
rep_samples <- data.frame(matrix(ncol=4, nrow = 1))
colnames(rep_samples) <- c("sample_slope", "slope_t", "sample_corr", "corr_t")
#creates a data frame internal to the function to help with variable titles
tmp <- ram_df %>% select({{num_var1}}, {{num_var2}})
colnames(tmp) <- c("x", "y")
# Repeatedly draw samples from your population of size N.
# Compute the slope and correlation as well as the t-scores of each
# Then add the results to the data frame
for (i in 1:T) {
sample <- tmp %>% slice_sample(n = N)
model <- lm(y ~ x, sample)
sample_slope <- model$coefficients[2]
slope_SE <- sqrt( (sum( (model$residuals)^2) ) / (N-2)) * (1/(sd(sample$x)*sqrt(N - 1)))
slope_t <- sample_slope / slope_SE
sample_corr <- cor(sample$x, sample$y)
corr_t <- sample_corr*sqrt( (N-2) / (1-sample_corr^2) )
rep_samples[i,] <- c(sample_slope, slope_t, sample_corr, corr_t)
}
# sets the output of the function to be the data frame generated
rep_samples
}
my_rep_samples <- rep_regression_samples(ram_df, area, capacity_bits, 20, 100)
print(head(my_rep_samples))
## sample_slope slope_t sample_corr corr_t
## 1 -0.4461785 NA NA NA
## 2 -0.1394102 NA NA NA
## 3 11.0056391 NA NA NA
## 4 11.1627907 NA NA NA
## 5 -0.2619231 NA NA NA
## 6 -0.2594152 NA NA NA
# Plot a histogram of the sample_slope column
library(ggplot2)
ggplot(my_rep_samples, aes(x = sample_slope)) +
geom_histogram(binwidth = 0.05, fill = "blue", color = "black") +
labs(title = "Histogram of Sample Slopes",
x = "Sample Slope",
y = "Frequency")
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Choose an alpha level
alpha <- 0.05
# Calculate the degrees of freedom
# Note: this n is the number of replicate samples (100); a t-test for a slope would
# strictly use the within-sample size, i.e. df = N - 2 = 18 for samples of size 20
n <- length(my_rep_samples$sample_slope) # number of replicate samples
deg_freedom <- n - 2
# Find the critical t value
critical_t <- qt(alpha / 2, deg_freedom, lower.tail = FALSE)
critical_t
## [1] 1.984467
# Calculate the proportion of samples where the absolute t-value exceeds the critical t-value
reject_null_proportion <- mean(abs(my_rep_samples$slope_t) > critical_t)
# Print the result
cat("Proportion of samples rejecting the null hypothesis:", reject_null_proportion)
## Proportion of samples rejecting the null hypothesis: NA
#I noticed that the slopes were mostly concentrated around 0, but there was a wide spread of outliers, indicating the sampling distribution of the slope is not normal. At alpha = 0.05 I was unable to say what proportion of samples rejected the null (NA), because the samples had missing t-scores and correlations due to missing area values. When increasing alpha to 0.10 the conclusion was essentially the same.
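# The NAs arise because most chips are missing area, so sd() and cor() return NA
# inside the resampling loop. A quick check (sketch) of how few complete cases exist:
ram_df %>% select(area, capacity_bits) %>% drop_na() %>% nrow()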
# Plot a histogram of the sample_slope column
library(ggplot2)
ggplot(my_rep_samples, aes(x = sample_slope)) +
geom_histogram(binwidth = 0.05, fill = "blue", color = "black") +
labs(title = "Histogram of Sample Slopes",
x = "Sample Slope",
y = "Frequency")
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Choose an alpha level
alpha <- 0.10
# Calculate the degrees of freedom
# Assuming you have n observations in each sample
n <- length(my_rep_samples$sample_slope) # or use the specific sample size
deg_freedom <- n - 2
# Find the critical t value
critical_t <- qt(alpha / 2, deg_freedom, lower.tail = FALSE)
critical_t
## [1] 1.660551
# Calculate the proportion of samples where the absolute t-value exceeds the critical t-value
reject_null_proportion <- mean(abs(my_rep_samples$slope_t) > critical_t)
# Print the result
cat("Proportion of samples rejecting the null hypothesis:", reject_null_proportion)
## Proportion of samples rejecting the null hypothesis: NA
# Check column names
names(ram_df)
## [1] "chip_name" "capacity_bits" "bit_units"
## [4] "ram_type" "transistor_count" "date_of_introduction"
## [7] "manufacturer_s" "process" "area"
## [10] "ref" "log(capacity_bits)" "y_hat"
# Check the structure of the data frame
str(ram_df)
## tibble [47 × 12] (S3: tbl_df/tbl/data.frame)
## $ chip_name : chr [1:47] "N/A" "N/A" "?" "SP95" ...
## $ capacity_bits : num [1:47] 1 1 8 16 16 NA 256 64 144 256 ...
## $ bit_units : chr [1:47] "Bits" "Bits" "Bits" "Bits" ...
## $ ram_type : chr [1:47] "SRAM (cell)" "DRAM (cell)" "SRAM (bipolar)" "SRAM (bipolar)" ...
## $ transistor_count : num [1:47] 6 1 48 80 96 ...
## $ date_of_introduction: num [1:47] 1963 1965 1965 1965 1966 ...
## $ manufacturer_s : chr [1:47] "Fairchild" "Toshiba" "SDS, Signetics" "IBM" ...
## $ process : num [1:47] NA NA NA NA NA NA NA NA NA 12000 ...
## $ area : num [1:47] NA NA NA NA NA NA NA NA NA NA ...
## $ ref : chr [1:47] "[162]" "[163][164]" "[162]" "[165]" ...
## $ log(capacity_bits) : num [1:47] 0 0 2.08 2.77 2.77 ...
## $ y_hat : Named num [1:47] NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "names")= chr [1:47] "1" "2" "3" "4" ...
# Create a binary indicator variable for Hitachi
ram_df$Hitachi <- ifelse(ram_df$manufacturer_s == "Hitachi", 1, 0)
# Now fit the model
model <- lm(capacity_bits ~ Hitachi, data = ram_df)
summary(model)
##
## Call:
## lm(formula = capacity_bits ~ Hitachi, data = ram_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.727 -58.727 -46.727 1.273 225.273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.73 13.87 4.524 4.56e-05 ***
## Hitachi -58.23 66.50 -0.876 0.386
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 91.97 on 44 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01713, Adjusted R-squared: -0.005211
## F-statistic: 0.7667 on 1 and 44 DF, p-value: 0.386
#Here I created a Hitachi indicator variable and modeled it against capacity_bits. In the coefficients section of the summary there is one row for (Intercept) and one row for the Hitachi indicator. The fitted equation is yhat = 62.73 - 58.23x, so for a Hitachi chip (x = 1) yhat = 62.73 - 58.23 = 4.50. The coefficient -58.23 measures how yhat changes when the chip is made by Hitachi compared to any other manufacturer.
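# A quick check (sketch): the group means should reproduce the fitted values,
# roughly 62.73 for non-Hitachi chips and 62.73 - 58.23 = 4.50 for Hitachi chips.
ram_df %>%
  group_by(Hitachi) %>%
  summarise(mean_capacity = mean(capacity_bits, na.rm = TRUE))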
# Subset the data to include only the selected manufacturers
subset_df <- ram_df %>%
filter(manufacturer_s %in% c("Hitachi", "IBM", "Toshiba"))
# View the first few rows of the subset data frame
head(subset_df)
## # A tibble: 6 × 13
## chip_name capacity_bits bit_units ram_type transistor_count
## <chr> <dbl> <chr> <chr> <dbl>
## 1 N/A 1 Bits DRAM (cell) 1
## 2 SP95 16 Bits SRAM (bipolar) 80
## 3 ? 8 kb DRAM (PMOS) 8192
## 4 ? 4 kb SRAM (CMOS) 24576
## 5 ? 288 kb DRAM 294912
## 6 ? 256 kb SRAM (CMOS) 1572864
## # ℹ 8 more variables: date_of_introduction <dbl>, manufacturer_s <chr>,
## # process <dbl>, area <dbl>, ref <chr>, `log(capacity_bits)` <dbl>,
## # y_hat <dbl>, Hitachi <dbl>
#Correlating Hitachi versus Toshiba and IBM
# Fit a linear model using manufacturer_s as a categorical predictor (on the full data set)
model <- lm(capacity_bits ~ manufacturer_s, data = ram_df)
# Print a summary of the linear model
summary(model)
##
## Call:
## lm(formula = capacity_bits ~ manufacturer_s, data = ram_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.000 -48.000 0.000 1.125 207.000
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 107.00 60.90
## manufacturer_sFujitsu, NEC -91.00 121.79
## manufacturer_sGeneral Instrument -105.00 121.79
## manufacturer_sHitachi -102.50 96.28
## manufacturer_sHitachi, NEC 149.00 121.79
## manufacturer_sHitachi, Toshiba -91.00 121.79
## manufacturer_sHyundai -104.50 96.28
## manufacturer_sIBM -3.00 86.12
## manufacturer_sIntel -58.00 72.78
## manufacturer_sIntel, Honeywell -106.00 121.79
## manufacturer_sMatsushita -43.00 121.79
## manufacturer_sMatsushita, Mitsubishi, Fujitsu, Toshiba -43.00 121.79
## manufacturer_sMitsubishi -106.00 121.79
## manufacturer_sNEC -25.80 77.03
## manufacturer_sNEC, Toshiba, Hitachi, Mitsubishi -103.00 121.79
## manufacturer_sNTT 5.00 86.12
## manufacturer_sSamsung -63.00 74.58
## manufacturer_sSDS, Signetics -99.00 121.79
## manufacturer_sSiemens -43.00 121.79
## manufacturer_sToshiba -20.00 86.12
## manufacturer_sTransitron -91.00 121.79
## t value Pr(>|t|)
## (Intercept) 1.757 0.0911 .
## manufacturer_sFujitsu, NEC -0.747 0.4619
## manufacturer_sGeneral Instrument -0.862 0.3968
## manufacturer_sHitachi -1.065 0.2973
## manufacturer_sHitachi, NEC 1.223 0.2326
## manufacturer_sHitachi, Toshiba -0.747 0.4619
## manufacturer_sHyundai -1.085 0.2881
## manufacturer_sIBM -0.035 0.9725
## manufacturer_sIntel -0.797 0.4330
## manufacturer_sIntel, Honeywell -0.870 0.3924
## manufacturer_sMatsushita -0.353 0.7270
## manufacturer_sMatsushita, Mitsubishi, Fujitsu, Toshiba -0.353 0.7270
## manufacturer_sMitsubishi -0.870 0.3924
## manufacturer_sNEC -0.335 0.7405
## manufacturer_sNEC, Toshiba, Hitachi, Mitsubishi -0.846 0.4057
## manufacturer_sNTT 0.058 0.9542
## manufacturer_sSamsung -0.845 0.4063
## manufacturer_sSDS, Signetics -0.813 0.4240
## manufacturer_sSiemens -0.353 0.7270
## manufacturer_sToshiba -0.232 0.8182
## manufacturer_sTransitron -0.747 0.4619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105.5 on 25 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2656, Adjusted R-squared: -0.322
## F-statistic: 0.452 on 20 and 25 DF, p-value: 0.963
#The equation I intended, yhat = 4.50 + 99.5*x_IBM + 82.5*x_Toshiba, corresponds to fitting the model on the three-manufacturer subset (subset_df) with Hitachi as the baseline level; it gives the expected capacity estimates for IBM and Toshiba versus Hitachi. The model above was fit on the full data set, so it reports a coefficient for every manufacturer relative to the first level instead.
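# A sketch of the model the equation above describes: fit on the three-manufacturer
# subset so that Hitachi becomes the baseline level (model_subset is a hypothetical name).
model_subset <- lm(capacity_bits ~ manufacturer_s, data = subset_df)
summary(model_subset)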
# Choose three of the numerical variables from your data set, and create a linear model using these three variables.
model_3_var <- lm(capacity_bits ~ area + transistor_count, ram_df)
summary(model_3_var)
##
## Call:
## lm(formula = capacity_bits ~ area + transistor_count, data = ram_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -134.827 -56.409 -5.762 22.651 182.208
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.146e+01 9.711e+01 -0.736 0.4895
## area 7.842e+00 3.729e+00 2.103 0.0802 .
## transistor_count -6.373e-05 2.842e-05 -2.242 0.0661 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104.3 on 6 degrees of freedom
## (38 observations deleted due to missingness)
## Multiple R-squared: 0.4613, Adjusted R-squared: 0.2817
## F-statistic: 2.569 on 2 and 6 DF, p-value: 0.1563
#This gives the intercept and slopes for the multiple regression model. The intercept is -71.46. The area slope says that, holding transistor count fixed, a 1 square millimeter increase in chip area is associated with an increase of about 7.84 bits of capacity. The transistor_count slope says that, holding area fixed, each additional transistor is associated with a decrease of about 0.0000637 bits. Since the p-value for each explanatory variable (area and transistor_count) is slightly above alpha = 0.05, each shows only marginal significance individually, and the overall model is not statistically significant (p = 0.1563), so the explanatory variables taken together do not have a strong relationship with capacity_bits.
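# broom (loaded at the top) can present the same coefficients as a tidy data frame,
# which makes it easier to pull out estimates and p-values programmatically (a sketch):
tidy(model_3_var)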
model_sub<-lm(area ~ capacity_bits + date_of_introduction+ram_type, ram_df)
summary(model_sub)
##
## Call:
## lm(formula = area ~ capacity_bits + date_of_introduction + ram_type,
## data = ram_df)
##
## Residuals:
## 12 14 16 21 22 24 25
## 3.003e-01 -4.554e-01 1.551e-01 3.282e-03 -6.661e-16 3.498e+00 -3.502e+00
## 27 31
## 8.743e-16 2.776e-17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.486e+03 3.352e+03 -1.637 0.2434
## capacity_bits -3.049e-02 3.531e-02 -0.863 0.4789
## date_of_introduction 2.786e+00 1.696e+00 1.643 0.2421
## ram_typeDRAM (CMOS) 9.799e+01 1.927e+01 5.084 0.0366 *
## ram_typeDRAM (NMOS) 1.431e+01 4.669e+00 3.065 0.0920 .
## ram_typeDRAM (PMOS) 6.598e+00 1.149e+01 0.574 0.6239
## ram_typeDRAM (VMOS) -1.257e+00 7.582e+00 -0.166 0.8836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.523 on 2 degrees of freedom
## (38 observations deleted due to missingness)
## Multiple R-squared: 0.9983, Adjusted R-squared: 0.9931
## F-statistic: 192.1 on 6 and 2 DF, p-value: 0.005188
#Model with three numerical variables and one categorical variable. The fitted regression equation is area = -5486 + (-0.03049 * capacity_bits) + (2.786 * date_of_introduction) + (97.99 * ram_typeDRAM (CMOS)) + (14.31 * ram_typeDRAM (NMOS)) + (6.598 * ram_typeDRAM (PMOS)) + (-1.257 * ram_typeDRAM (VMOS)), where the response variable yhat is "area". The ram_type level "DRAM (CMOS)" is significantly associated with area, suggesting that this type of RAM has a considerable positive impact on area, and "DRAM (NMOS)" has a marginally significant effect, while the other ram_type categories do not show a significant relationship with area. capacity_bits and date_of_introduction do not have significant relationships with area based on their p-values, meaning that changes in these variables do not predict area reliably within this model. Overall, certain types of RAM (DRAM (CMOS) and, to a lesser extent, DRAM (NMOS)) significantly affect area, while the effects of capacity_bits and date_of_introduction are less clear.
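# Example prediction from the final model using purely hypothetical chip values
# (capacity_bits, date_of_introduction, and ram_type chosen only for illustration):
new_chip <- data.frame(capacity_bits = 4096,
                       date_of_introduction = 1975,
                       ram_type = "DRAM (NMOS)")
predict(model_sub, newdata = new_chip)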