## Installing packages
# Install plyr only when it is missing, so that re-running the script does not
# re-download the package (or require network access) on every execution.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr", repos = "https://cran.us.r-project.org")
}
# Recorded output from the run that performed the install:
## Installing package into 'C:/Users/madel/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'plyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\madel\AppData\Local\Temp\RtmpENG8kT\downloaded_packages
library(readr)
#install.packages("ggplot2")
library(ggplot2)
# "BrainCancerData1.csv" is a mouthful, so the data is assigned to a shorter
# name. The path is relative to the current working directory.
bca <- read_csv("BrainCancerData1.csv")
## Rows: 48 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): year, rate_cases1, rate_cases2, death_rate, pct_survival
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Part A
# Question one: identify the data type of each column.
# Quick reference: numeric values are numbers (e.g. int, dbl); character values
# are quoted strings; logical values are TRUE/FALSE (boolean); factors are
# categorical data with a fixed set of levels (e.g. "a", "b", "c").
str(bca)
## spc_tbl_ [48 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ year : num [1:48] 1975 1976 1977 1978 1979 ...
## $ rate_cases1 : num [1:48] NA NA NA NA NA NA NA NA NA NA ...
## $ rate_cases2 : num [1:48] 5.93 5.86 6.29 5.8 6.12 6.57 6.53 6.39 6.37 5.96 ...
## $ death_rate : num [1:48] 4.11 4.34 4.4 4.53 4.26 4.37 4.36 4.43 4.39 4.55 ...
## $ pct_survival: num [1:48] 23.5 22.6 22.3 25.2 23.1 ...
## - attr(*, "spec")=
## .. cols(
## .. year = col_double(),
## .. rate_cases1 = col_double(),
## .. rate_cases2 = col_double(),
## .. death_rate = col_double(),
## .. pct_survival = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# str() reports every column as num (double), i.e. all five columns are numeric.

# Question two: index the column rate_cases1, i.e. extract just that column
# from the dataset as a plain vector.
bca[["rate_cases1"]] # prints only the values of the rate_cases1 column
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [16] NA NA 6.73 6.50 6.32 6.30 6.38 6.48 6.47 6.67 6.38 6.29 6.44 6.49 6.54
## [31] 6.54 6.26 6.43 6.40 6.63 6.31 6.15 6.25 6.31 6.13 6.40 6.07 6.18 6.17 5.87
## [46] 5.90 5.67 NA
# Row indexing: [i, ] with an empty column slot keeps every column of row i.
bca[1, ] # first row (year 1975)
## # A tibble: 1 × 5
## year rate_cases1 rate_cases2 death_rate pct_survival
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1975 NA 5.93 4.11 23.5
bca[2, ] # second row (year 1976)
## # A tibble: 1 × 5
## year rate_cases1 rate_cases2 death_rate pct_survival
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1976 NA 5.86 4.34 22.6
# So [row, ] returns a one-row tibble containing all five columns.
## Part B
# Summary statistics for each column. Every statistic is computed once, stored
# in a variable, and printed by evaluating that variable. na.rm = TRUE drops
# the missing values before computing.

# rate_cases1
avr_rate_cases1 <- mean(bca$rate_cases1, na.rm = TRUE) # mean
avr_rate_cases1
## [1] 6.322
sd_rate_cases1 <- sd(bca$rate_cases1, na.rm = TRUE) # SD (previously stored the mean by mistake)
sd_rate_cases1
## [1] 0.236372
rg_rate_cases1 <- range(bca$rate_cases1, na.rm = TRUE) # c(min, max) (previously stored the mean by mistake)
rg_rate_cases1
## [1] 5.67 6.73

#Question 1.1: Calculate the mean of the rate of incidence for rate_cases2
avr_rate_cases <- mean(bca$rate_cases2, na.rm = TRUE)
avr_rate_cases
## [1] 6.561489
#Question 1.2: Calculate the mean of the rate of incidence for death_rate
avr_death_rate <- mean(bca$death_rate, na.rm = TRUE)
avr_death_rate
## [1] 4.483958
#Question 1.3: Calculate the mean of the rate of incidence for pct_survival
avr_pct_survival <- mean(bca$pct_survival, na.rm = TRUE)
avr_pct_survival
## [1] 31.21929

#Question 2.1: Calculate the SD of the rate of incidence for rate_cases2
sd_rate_cases <- sd(bca$rate_cases2, na.rm = TRUE) # previously stored the SD of rate_cases1
sd_rate_cases
## [1] 0.3297692
#Question 2.2: Calculate the SD of the rate of incidence for death_rate
sd_death_rate <- sd(bca$death_rate, na.rm = TRUE)
sd_death_rate
## [1] 0.1983092
#Question 2.3: Calculate the SD of the rate of incidence for pct_survival
sd_pct_survival <- sd(bca$pct_survival, na.rm = TRUE)
sd_pct_survival
## [1] 4.832159

# The rg_* variables below previously stored sd() instead of range(); each now
# holds a length-2 vector c(min, max).
#Question 3.1: Calculate the range of the rate of incidence for rate_cases2
rg_rate_cases <- range(bca$rate_cases2, na.rm = TRUE)
rg_rate_cases
## (recorded output removed: the original call used rate_cases1 by mistake and
## printed its range, 5.67 6.73; re-run to record the rate_cases2 range)
#Question 3.2: Calculate the range of the rate of incidence for death_rate
rg_death_rate <- range(bca$death_rate, na.rm = TRUE)
rg_death_rate
## [1] 4.11 4.95
#Question 3.3: Calculate the range of the rate of incidence for pct_survival
rg_pct_survival <- range(bca$pct_survival, na.rm = TRUE)
rg_pct_survival
## [1] 22.33 37.93
#4.1. The mean value of “rate_cases1” was calculated to be 6.322. What does this tell you about the average rate of new brain cancer cases?
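#On average, the rate of new brain cancer cases was about 6.3 per year (in the dataset's rate units, presumably cases per 100,000 people; confirm against the data source). It is a rate, not a count of six individual cases.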
#4.2 The standard deviation value of “rate_cases1” was calculated to be 0.236372. How does this value help us understand the variability of the new brain cancer cases in the dataset?
#Low sd indicates that most values remained relatively close to the mean.
#4.3. Given that the range of “rate_cases1” is from 5.67 to 6.73, what does this suggest about the spread of new brain cancer cases, and how does this relate to the mean and standard deviation values?
#New cases fall within a narrow range that clusters closely around the mean (6.322), consistent with the small standard deviation. The rate of cancer incidence is stable.
#4.4. The mean death rate was calculated to be 4.484, with a standard deviation of 0.1983. What does this low
#standard deviation tell you about the consistency of death rates in the dataset, and how might this information be important for public health analysis?
#Death rates are consistent: the sd is low, which suggests that treatments, interventions, etc. have had a mostly uniform impact over the years, with no significant differences or deviations.
#4.5. The mean of “pct_survival” was calculated as 31.22, while its standard deviation is 4.8322. What do these values tell us about the survival rates in the dataset? Are the survival rates more or less consistent compared to the new brain cancer rates?
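#Survival rates vary more around their mean than the new-case rates do: the sd is about 15% of the mean (4.83 / 31.22), versus about 4% for rate_cases1 (0.236 / 6.322), so survival rates are less consistent. Because each row of the dataset is one year, this spread reflects differences between years rather than between individual patients.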
#Part C
# Linear regression: y = mx + b, where y is the dependent variable, x is the
# predictor, m is the slope and b is the intercept.

# Passing the whole data frame as the formula makes lm() derive the model from
# the columns: as the coefficients below show, year (the first column) becomes
# the response and every other column becomes a predictor.
lm(formula = bca)
##
## Call:
## lm(formula = bca)
##
## Coefficients:
## (Intercept) rate_cases1 rate_cases2 death_rate pct_survival
## 2179.1530 -15.7481 0.4679 -20.2839 0.3856

# Fit survival percentage against year. The formula uses bare column names
# resolved through `data = bca` (rather than bca$...), so the model stores
# proper variable names and works with predict(newdata = ...). The model is
# fitted once and reused for both the short printout and the full summary.
linreg <- lm(pct_survival ~ year, data = bca)
linreg
##
## Call:
## lm(formula = pct_survival ~ year, data = bca)
##
## Coefficients:
## (Intercept) year
## -682.2565 0.3575
summary(linreg)
##
## Call:
## lm(formula = pct_survival ~ year, data = bca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.269 -1.617 -0.091 1.784 3.142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -682.25645 52.14369 -13.08 4.89e-16 ***
## year 0.35754 0.02613 13.68 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.053 on 40 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.824, Adjusted R-squared: 0.8196
## F-statistic: 187.2 on 1 and 40 DF, p-value: < 2.2e-16

# Visualize the data: scatter plot of survival by year with the fitted
# least-squares line (method = "lm") overlaid.
ggplot(data = bca, aes(x = year, y = pct_survival)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Year", y = "Survival (%)", title = "Brain Cancer: Year vs. Survival (%)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
