New DATA1901 Project

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
file <- "/Users/davidlu/Documents/Uni Stuff/DATA1901/1901_data.csv"
data <- read.csv(file)

data_excluded <- filter(data, gid_pid != "nismwt_28")

data_1 <- mutate(data, agase_total = agase_1 + agase_2 + agase_3 + agase_4 + agase_5 + agase_6 + agase_7 + agase_8 + agase_9 + agase_10)
#we actually don't need symptom change, it is given by the variable tot
data_2 <- mutate(data_1, symptom_change = agase_total - baseline_gase)
#see if we need to exclude high baseline donny

data_excluded_1 <- mutate(data_excluded, agase_total = agase_1 + agase_2 + agase_3 + agase_4 + agase_5 + agase_6 + agase_7 + agase_8 + agase_9 + agase_10)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(rafalib)
#see how to get rid of warning
#may ned to remove outlier (find out which is outlier first)
#what other graphs? --> positive experience, vs. no site of that (as that happens before)
#intervention vs no intervention should only be relevant for baseline gase
#outlier greater than 6 or mean of 4 as per study
#note that removing the outlier, who is male, does not affect the analysis - female symptom reporting is already innately higher
#analysis should see if this is a social cause, a physiological cause, or combined
# in fact men may actually be sicker than women, but not report it ("man flu")
#do we care about the absolute value of the symptom change?
data_2_tot_barplot <- data_2

male_filtered <- filter(data_2, gender == "Male")
male_filtered_excluded <- filter(data_excluded_1, gender == "Male")

female_filtered <- filter(data_2, gender == "Female")
female_filtered_excluded <- filter(data_excluded_1, gender == "Female")
#the data from the excluded shown below does not affect our results much at all

gender_histogram_change <- plot_ly(histnorm = "probability") %>%
  add_histogram(x = male_filtered$tot, name = "Men", nbinsx = 25, opacity = 1.0) %>%
  add_histogram(x = female_filtered$tot, name = "Women", nbinsx = 25, opacity = 1.0) %>%
  layout(title = "Male and Female Symptom Changes", 
         xaxis = list(title = "Symptom Change"),
         yaxis = list(title = "Frequency"))
     
gender_histogram_change
gender_histogram_change_outlier_excluded <- plot_ly(histnorm = "probability") %>%
  add_histogram(x = male_filtered_excluded$tot, name = "Men", nbinsx = 25, opacity = 1.0) %>%
  add_histogram(x = female_filtered_excluded$tot, name = "Women", nbinsx = 25, opacity = 1.0) %>%
  layout(title = "Male and Female Symptom Changes - Outlier Removed", 
         xaxis = list(title = "Symptom Change"),
         yaxis = list(title = "Frequency"))
     
gender_histogram_change_outlier_excluded
sd(female_filtered$tot)
[1] 2.72394
sd(male_filtered$tot)
[1] 1.874904
sd(female_filtered_excluded$tot)
[1] 2.72394
sd(male_filtered_excluded$tot)
[1] 1.882669
male_filtered_tot_mean = mean(abs(male_filtered$tot))
female_filtered_tot_mean = mean(abs(female_filtered$tot))
#using absolute values here to see how much symptoms change (is this relevant?)
data_2_tot_barplot <- data.frame(
  Gender = c("Male", "Female"),
  Average_Symptom = c(male_filtered_tot_mean, female_filtered_tot_mean)
)
ggplot(data_2_tot_barplot, aes(x = Gender, y = Average_Symptom)) +
  geom_bar(stat = "identity") +
  labs(title = "Mean Symptom Change by Gender", x = "Gender", y = "Mean Symptom Change")

male_filtered_tot_mean
[1] 1.234043
female_filtered_tot_mean
[1] 1.6
data_2_agase_barplot <- data_2

gender_histogram_agase <- plot_ly(histnorm = "probability") %>%
  add_histogram(x = male_filtered$agase_total, name = "Men", nbinsx = 15, opacity = 1.0) %>%
  add_histogram(x = female_filtered$agase_total, name = "Women", nbinsx = 15, opacity = 1.0) %>%
  layout(title = "Male and Female Active GASE", 
         xaxis = list(title = "Active GASE"),
         yaxis = list(title = "Frequency"))
     
gender_histogram_agase
sd(female_filtered$agase_total)
[1] 3.362998
sd(male_filtered$agase_total)
[1] 2.394103
summary(female_filtered$agase_total)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.00   10.00   11.00   12.42   13.00   32.00 
summary(male_filtered$agase_total)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.00   10.00   11.00   12.09   13.50   20.00 
male_filtered_agase_mean = mean(abs(male_filtered$agase_total))
female_filtered_agase_mean = mean(abs(female_filtered$agase_total))
data_2_agase_barplot <- data.frame(
  Gender = c("Male", "Female"),
  Active_Gase = c(male_filtered_agase_mean, female_filtered_agase_mean)
)
ggplot(data_2_agase_barplot, aes(x = Gender, y = Active_Gase)) +
  geom_bar(stat = "identity") +
  labs(title = "Mean Active GASE by Gender", x = "Gender", y = "Mean Active GASE")

male_filtered_agase_mean
[1] 12.08511
female_filtered_agase_mean
[1] 12.41818
data_2_bgase_barplot <- data_2

gender_histogram_baseline <- plot_ly(histnorm = "probability") %>%
  add_histogram(x = male_filtered$baseline_gase, name = "Men", nbinsx = 15, opacity = 1.0) %>%
  add_histogram(x = female_filtered$baseline_gase, name = "Women", nbinsx = 15, opacity = 1.0) %>%
  layout(title = "Male and Female Baseline GASE", 
         xaxis = list(title = "Baseline GASE"),
         yaxis = list(title = "Frequency"))
     
gender_histogram_baseline
sd(female_filtered$baseline_gase)
[1] 3.302841
sd(male_filtered$baseline_gase)
[1] 2.454394
summary(female_filtered$baseline_gase)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.00   10.00   10.00   11.84   12.00   31.00 
summary(male_filtered$baseline_gase)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.00   10.00   11.00   11.62   12.00   21.00 
male_filtered_bgase_mean = mean(abs(male_filtered$baseline_gase))
female_filtered_bgase_mean = mean(abs(female_filtered$baseline_gase))
data_2_bgase_barplot <- data.frame(
  Gender = c("Male", "Female"),
  Baseline_Gase = c(male_filtered_bgase_mean, female_filtered_bgase_mean)
)
ggplot(data_2_bgase_barplot, aes(x = Gender, y = Baseline_Gase)) +
  geom_bar(stat = "identity") +
  labs(title = "Mean Baseline GASE by Gender", x = "Gender", y = "Mean Baseline GASE")

male_filtered_bgase_mean
[1] 11.61702
female_filtered_bgase_mean
[1] 11.83636