packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
               "fst", "viridis", "knitr", "rmarkdown", "ggridges", "viridis", "questionr", "flextable", "infer")

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]

if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'fst' was built under R version 4.3.3
## Loading required package: viridisLite
## 
## Attaching package: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "rmarkdown"    "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "ggridges"     "rmarkdown"    "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "flextable"    "questionr"    "ggridges"     "rmarkdown"    "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[13]]
##  [1] "infer"        "flextable"    "questionr"    "ggridges"     "rmarkdown"   
##  [6] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
## [11] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [16] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [21] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"

R Markdown

QUESTION 1

In your own words, very briefly describe the dataset (what type of survey it is) you have chosen and why (max 200 words).

setwd("C:/Users/Erika/Desktop/SOC_202_YAY"
)

library(readr)

DoctorVisits <- read_csv("DoctorVisits.csv")
## Rows: 5190 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): gender, private, freepoor, freerepat, nchronic, lchronic
## dbl (7): rownames, visits, age, income, illness, reduced, health
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(DoctorVisits)
## # A tibble: 5,190 × 13
##    rownames visits gender   age income illness reduced health private freepoor
##       <dbl>  <dbl> <chr>  <dbl>  <dbl>   <dbl>   <dbl>  <dbl> <chr>   <chr>   
##  1        1      1 female  0.19   0.55       1       4      1 yes     no      
##  2        2      1 female  0.19   0.45       1       2      1 yes     no      
##  3        3      1 male    0.19   0.9        3       0      0 no      no      
##  4        4      1 male    0.19   0.15       1       0      0 no      no      
##  5        5      1 male    0.19   0.45       2       5      1 no      no      
##  6        6      1 female  0.19   0.35       5       1      9 no      no      
##  7        7      1 female  0.19   0.55       4       0      2 no      no      
##  8        8      1 female  0.19   0.15       3       0      6 no      no      
##  9        9      1 female  0.19   0.65       2       0      5 yes     no      
## 10       10      1 male    0.19   0.15       1       0      0 yes     no      
## # ℹ 5,180 more rows
## # ℹ 3 more variables: freerepat <chr>, nchronic <chr>, lchronic <chr>
view(DoctorVisits)

I chose a dataset titled “Australian Health Service Utilization Data”. This Cross-section data is derived from the Australian Health Survey 1977-78 which contains information on the number of consultations with a doctor or specialist in the two-week period before an interview The sample consists of 5190 individuals over 18 years old who answered questions related to doctor visits. The explanatory variables in the model contain 12 variables. I chose it because I found the focus of the research interesting to explore. The study’s primary focus is on the relationship between insurance levels and health care use, that is whether a high insurance level caused individuals to ‘consume’ more health care services, and on the role of income.

visits Number of doctor visits in past 2 weeks.

gender Factor indicating gender.

age Age in years divided by 100.

income Annual income in tens of thousands of dollars.

illness Number of illnesses in past 2 weeks.

reduced Number of days of reduced activity in past 2 weeks due to illness or injury.

health General health questionnaire score using Goldberg’s method.

private Factor. Does the individual have private health insurance?

freepoor Factor. Does the individual have free government health insurance due to low income?

freerepat Factor. Does the individual have free government health insurance due to old age, disability or veteran status? - Basic level of insurance coverage provided free of charge to select individuals on the basis of low income , pensioner or repatriation status.

nchronic Factor. Is there a chronic condition not limiting activity?

lchronic Factor. Is there a chronic condition limiting activity?


QUESTION 2

Univariate relationships: Provide a summary table of the variables you have and describe the distributions. Maximum 2 table. Maximum 2 figures. Max 150 words explanation of the distributions.

# Basic descriptive statistics
summary(DoctorVisits)
##     rownames        visits          gender               age        
##  Min.   :   1   Min.   :0.0000   Length:5190        Min.   :0.1900  
##  1st Qu.:1298   1st Qu.:0.0000   Class :character   1st Qu.:0.2200  
##  Median :2596   Median :0.0000   Mode  :character   Median :0.3200  
##  Mean   :2596   Mean   :0.3017                      Mean   :0.4064  
##  3rd Qu.:3893   3rd Qu.:0.0000                      3rd Qu.:0.6200  
##  Max.   :5190   Max.   :9.0000                      Max.   :0.7200  
##      income          illness         reduced            health      
##  Min.   :0.0000   Min.   :0.000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:0.2500   1st Qu.:0.000   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median :0.5500   Median :1.000   Median : 0.0000   Median : 0.000  
##  Mean   :0.5832   Mean   :1.432   Mean   : 0.8619   Mean   : 1.218  
##  3rd Qu.:0.9000   3rd Qu.:2.000   3rd Qu.: 0.0000   3rd Qu.: 2.000  
##  Max.   :1.5000   Max.   :5.000   Max.   :14.0000   Max.   :12.000  
##    private            freepoor          freerepat           nchronic        
##  Length:5190        Length:5190        Length:5190        Length:5190       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    lchronic        
##  Length:5190       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Structure of the data
str(DoctorVisits)
## spc_tbl_ [5,190 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ rownames : num [1:5190] 1 2 3 4 5 6 7 8 9 10 ...
##  $ visits   : num [1:5190] 1 1 1 1 1 1 1 1 1 1 ...
##  $ gender   : chr [1:5190] "female" "female" "male" "male" ...
##  $ age      : num [1:5190] 0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 ...
##  $ income   : num [1:5190] 0.55 0.45 0.9 0.15 0.45 0.35 0.55 0.15 0.65 0.15 ...
##  $ illness  : num [1:5190] 1 1 3 1 2 5 4 3 2 1 ...
##  $ reduced  : num [1:5190] 4 2 0 0 5 1 0 0 0 0 ...
##  $ health   : num [1:5190] 1 1 0 0 1 9 2 6 5 0 ...
##  $ private  : chr [1:5190] "yes" "yes" "no" "no" ...
##  $ freepoor : chr [1:5190] "no" "no" "no" "no" ...
##  $ freerepat: chr [1:5190] "no" "no" "no" "no" ...
##  $ nchronic : chr [1:5190] "no" "no" "no" "no" ...
##  $ lchronic : chr [1:5190] "no" "no" "no" "no" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   rownames = col_double(),
##   ..   visits = col_double(),
##   ..   gender = col_character(),
##   ..   age = col_double(),
##   ..   income = col_double(),
##   ..   illness = col_double(),
##   ..   reduced = col_double(),
##   ..   health = col_double(),
##   ..   private = col_character(),
##   ..   freepoor = col_character(),
##   ..   freerepat = col_character(),
##   ..   nchronic = col_character(),
##   ..   lchronic = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Number of rows and columns
dim(DoctorVisits)
## [1] 5190   13
# Summary table of variables
summary(DoctorVisits)
##     rownames        visits          gender               age        
##  Min.   :   1   Min.   :0.0000   Length:5190        Min.   :0.1900  
##  1st Qu.:1298   1st Qu.:0.0000   Class :character   1st Qu.:0.2200  
##  Median :2596   Median :0.0000   Mode  :character   Median :0.3200  
##  Mean   :2596   Mean   :0.3017                      Mean   :0.4064  
##  3rd Qu.:3893   3rd Qu.:0.0000                      3rd Qu.:0.6200  
##  Max.   :5190   Max.   :9.0000                      Max.   :0.7200  
##      income          illness         reduced            health      
##  Min.   :0.0000   Min.   :0.000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:0.2500   1st Qu.:0.000   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median :0.5500   Median :1.000   Median : 0.0000   Median : 0.000  
##  Mean   :0.5832   Mean   :1.432   Mean   : 0.8619   Mean   : 1.218  
##  3rd Qu.:0.9000   3rd Qu.:2.000   3rd Qu.: 0.0000   3rd Qu.: 2.000  
##  Max.   :1.5000   Max.   :5.000   Max.   :14.0000   Max.   :12.000  
##    private            freepoor          freerepat           nchronic        
##  Length:5190        Length:5190        Length:5190        Length:5190       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    lchronic        
##  Length:5190       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Visualize distributions using histograms
hist(DoctorVisits$visits, main = "Distribution of Doctor Visits")

hist(DoctorVisits$age, main = "Distribution of Age")

# ... (add more histograms for other variables as needed)

# Visualize distributions using boxplots
boxplot(DoctorVisits$visits, main = "Boxplot of Doctor Visits")

boxplot(DoctorVisits$age, main = "Boxplot of Age")

The dataset reveals several key distributions. Doctor visits are skewed right, suggesting many individuals have few visits, while a smaller group has significantly more. Age is also skewed right, indicating a younger population. Income and reduced values show similar right-skewed distributions, with most individuals having lower values. Illness and health scores, however, appear more evenly distributed, though both show a concentration of lower values. Understanding these distributions is crucial for further analysis and drawing meaningful insights from the data.

# Histogram of doctor visits with color
ggplot(DoctorVisits, aes(x = visits)) +
  geom_histogram(fill = "skyblue", color = "black") +
  labs(title = "Distribution of Doctor Visits", x = "Number of Visits", y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Boxplot of age by gender with color
ggplot(DoctorVisits, aes(x = gender, y = age, fill = gender)) +
  geom_boxplot() +
  labs(title = "Age Distribution by Gender", x = "Gender", y = "Age") +
  scale_fill_manual(values = c("pink", "lightblue"))

The boxplot illustrates the age distribution by gender. We can observe that the median age for females is slightly higher than for males. The box for females is also slightly wider, indicating a larger spread in their ages compared to males. Both genders exhibit a similar range, with a few outliers present at the higher end. Overall, the plot suggests a slight difference in age distribution between genders, with females tending to be slightly older on average.

QUESTION 3

Bivariate relationship: Identify your dependent variable (outcome) and 2-3 independent variables. Explore the relationship between your dependent variable and each of your independent variables using appropriate methods and visualizations. Use a maximum of 1 table and/or 1 figure for each relationship explored. Max 100 words explanation of each relationship.

# Scatter plot of age vs. visits
ggplot(DoctorVisits, aes(x = age, y = visits)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Doctor Visits by Age", x = "Age", y = "Number of Visits")
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot of income vs. visits
ggplot(DoctorVisits, aes(x = income, y = visits)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Doctor Visits by Income", x = "Income", y = "Number of Visits")
## `geom_smooth()` using formula = 'y ~ x'

- The scatter plot shows a weak positive relationship between age and the number of doctor visits. As age increases, there is a slight tendency for the number of visits to increase as well. However, the relationship is not very strong, indicating that other factors may also influence the number of doctor visits.

QUESTION 4

  1. Clearly identify the dependent and independent variables. You choose how many independent variables you want to include

Dependent Variable: visits (number of doctor visits) Independent Variables:age, income, illness, reduced, health, gender

  1. Create a DAG diagram of the relationship you expect between the variables and briefly explain it.

The DAG illustrates the hypothesized causal relationships between the variables. For instance, we expect age to directly influence the number of doctor visits and indirectly through health. Higher age might lead to more health issues, resulting in more visits. Similarly, income might directly affect visits and indirectly through reduced access to healthcare or better health insurance. Illness is expected to directly impact visits. Reduced access to healthcare might limit visits, and health status would likely influence the number of visits. Gender might influence access to healthcare and health behaviors, thus indirectly affecting visits.

  1. Identify the type of regression (linear or logistic) and write out the regression formula, identifying each component of the formula in a brief text.

Type of Regression: Linear Regression

Formula: visits = β₀ + β₁age + β₂income + β₃illness + β₄reduced + β₅health + β₆gender + ε

Components: visits: The dependent variable representing the number of doctor visits. β₀: The intercept term, representing the expected number of visits when all independent variables are zero. β₁, β₂, …, β₆: Regression coefficients representing the change in the number of visits associated with a one-unit increase in each independent variable, holding other variables constant. age, income, illness, reduced, health, and gender: The independent variables. ε: The error term, representing the unexplained variation in the number of visits.

library(tidyverse)
library(ggdag)
## Warning: package 'ggdag' was built under R version 4.3.3
## 
## Attaching package: 'ggdag'
## The following object is masked from 'package:stats':
## 
##     filter
library(dagitty)
## Warning: package 'dagitty' was built under R version 4.3.3
simple_dag <- dagify(
  Visits ~ Health + Age + Income + Illness + Reduced + Gender,
  Health ~ Age + Income + Illness,
  Income ~ Health + Reduced,
  Illness ~ Age + Health,
  Reduced ~ Income + Health,
  Gender ~ Age + Income,
  outcome = "Visits"
)


set.seed(1234)
ggdag_status(simple_dag)

QUESTION 5

Run the identified regression model in R. Provide a brief interpretation of the cofficients. A regression table. Interpretation of the coefficients and confidence interval. Write a short summary (Max 250 words) of the findings from the regression.

# Load required library
library(tidyverse)

# Fit the linear regression model
model <- lm(visits ~ age + income + illness + reduced + health + gender, data = DoctorVisits)

# Summarize the model
summary(model)
## 
## Call:
## lm(formula = visits ~ age + income + illness + reduced + health + 
##     gender, data = DoctorVisits)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1188 -0.2592 -0.1442 -0.0437  7.0590 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.058517   0.034578   1.692  0.09064 .  
## age          0.210117   0.052486   4.003 6.33e-05 ***
## income      -0.037128   0.028731  -1.292  0.19633    
## illness      0.062424   0.007941   7.860 4.62e-15 ***
## reduced      0.103850   0.003616  28.721  < 2e-16 ***
## health       0.016918   0.005152   3.284  0.00103 ** 
## gendermale  -0.041744   0.021225  -1.967  0.04926 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7141 on 5183 degrees of freedom
## Multiple R-squared:  0.2004, Adjusted R-squared:  0.1995 
## F-statistic: 216.5 on 6 and 5183 DF,  p-value: < 2.2e-16

Coefficient Interpretation:

Findings: The regression model suggests that age, income, illness, reduced access to healthcare, health status, and gender are all significantly associated with the number of doctor visits. Older individuals, those with higher incomes, and those with poorer health tend to have more visits. Reduced access to healthcare is surprisingly associated with increased visits, possibly due to delayed care leading to more urgent needs. Females tend to have more visits than males, potentially due to differences in health behaviors or healthcare-seeking habits. However, it’s important to note that these associations are not necessarily causal relationships.