# Load the DoctorVisits data set into `data`.
# The workspace is deliberately NOT cleared with rm(list = ls()): that call
# deletes the caller's objects while leaving attached packages and options in
# place, so it gives no real isolation. Run the script in a fresh R session
# instead.
# NOTE(review): the path below is machine-specific; point it at a local copy.
data <- read.csv("C:/Users/tenuu/OneDrive/Desktop/DoctorVisits.csv")
# Variable descriptions:
#   visits    - Number of doctor visits in past 2 weeks.
#   gender    - Factor indicating gender.
#   age       - Age in years divided by 100.
#   income    - Annual income in tens of thousands of dollars.
#   illness   - Number of illnesses in past 2 weeks.
#   reduced   - Number of days of reduced activity in past 2 weeks due to
#               illness or injury.
#   health    - General health questionnaire score using Goldberg's method.
#   private   - Factor. Does the individual have private health insurance?
#   freepoor  - Factor. Does the individual have free government health
#               insurance due to low income?
#   freerepat - Factor. Does the individual have free government health
#               insurance due to old age, disability or veteran status?
#   nchronic  - Factor. Is there a chronic condition not limiting activity?
#   lchronic  - Factor. Is there a chronic condition limiting activity?
# Recode the five yes/no indicator columns in place: "yes" becomes the string
# "1" and "no" becomes the string "0". Any other value is left untouched, and
# the columns stay character (they are not converted to numeric or factor).
indicator_cols <- c("private", "freepoor", "freerepat", "nchronic", "lchronic")
data[indicator_cols] <- lapply(data[indicator_cols], function(flags) {
  flags[flags == "yes"] <- "1"
  flags[flags == "no"] <- "0"
  flags
})

# Show the structure of the recoded data frame.
str(data)
## 'data.frame': 5190 obs. of 12 variables:
## $ visits : int 1 1 1 1 1 1 1 1 1 1 ...
## $ gender : chr "female" "female" "male" "male" ...
## $ age : num 0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 ...
## $ income : num 0.55 0.45 0.9 0.15 0.45 0.35 0.55 0.15 0.65 0.15 ...
## $ illness : int 1 1 3 1 2 5 4 3 2 1 ...
## $ reduced : int 4 2 0 0 5 1 0 0 0 0 ...
## $ health : int 1 1 0 0 1 9 2 6 5 0 ...
## $ private : chr "1" "1" "0" "0" ...
## $ freepoor : chr "0" "0" "0" "0" ...
## $ freerepat: chr "0" "0" "0" "0" ...
## $ nchronic : chr "0" "0" "0" "0" ...
## $ lchronic : chr "0" "0" "0" "0" ...
# Standard deviation and variance of each numeric variable, one row apiece.
numeric_vars <- c("visits", "age", "income", "illness", "reduced", "health")
summary_stats <- data.frame(
  Variable = numeric_vars,
  # unname() keeps the default 1..n row names instead of the column names.
  Standard_Deviation = unname(vapply(data[numeric_vars], sd, numeric(1))),
  Variance = unname(vapply(data[numeric_vars], var, numeric(1)))
)
print(summary_stats)
## Variable Standard_Deviation Variance
## 1 visits 0.7981338 0.63701761
## 2 age 0.2047818 0.04193559
## 3 income 0.3689067 0.13609215
## 4 illness 1.3841524 1.91587793
## 5 reduced 2.8876284 8.33839781
## 6 health 2.1242665 4.51250808
# Per-column summary: min/quartiles/mean/max for the numeric columns; the
# character columns (gender and the recoded indicators) report only their
# length, class and mode.
summary(data)
## visits gender age income
## Min. :0.0000 Length:5190 Min. :0.1900 Min. :0.0000
## 1st Qu.:0.0000 Class :character 1st Qu.:0.2200 1st Qu.:0.2500
## Median :0.0000 Mode :character Median :0.3200 Median :0.5500
## Mean :0.3017 Mean :0.4064 Mean :0.5832
## 3rd Qu.:0.0000 3rd Qu.:0.6200 3rd Qu.:0.9000
## Max. :9.0000 Max. :0.7200 Max. :1.5000
## illness reduced health private
## Min. :0.000 Min. : 0.0000 Min. : 0.000 Length:5190
## 1st Qu.:0.000 1st Qu.: 0.0000 1st Qu.: 0.000 Class :character
## Median :1.000 Median : 0.0000 Median : 0.000 Mode :character
## Mean :1.432 Mean : 0.8619 Mean : 1.218
## 3rd Qu.:2.000 3rd Qu.: 0.0000 3rd Qu.: 2.000
## Max. :5.000 Max. :14.0000 Max. :12.000
## freepoor freerepat nchronic lchronic
## Length:5190 Length:5190 Length:5190 Length:5190
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
# Install ggplot2 only when it is missing, then attach it unconditionally.
# With `if (!require(...)) install.packages(...)` the package was installed
# but never attached on a machine that did not have it yet, so the first
# ggplot() call would fail on that first run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
# Install dplyr only when it is missing, then attach it unconditionally.
# With `if (!require(...)) install.packages(...)` the package was installed
# but never attached on a machine that did not have it yet, so the `%>%`
# pipeline that follows would fail on that first run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Share of respondents by gender: one row per gender holding its row count
# and its percentage of all rows.
gender_counts <- summarise(group_by(data, gender), count = n())
gender_counts <- mutate(gender_counts, percentage = count / sum(count) * 100)

# A pie chart is a single stacked bar drawn in polar coordinates; the rounded
# percentage is printed at the midpoint of each slice.
pie_chart <- ggplot(
  gender_counts,
  aes(x = "", y = percentage, fill = gender)
) +
  geom_col(width = 1) +
  geom_text(
    aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 6
  ) +
  coord_polar("y", start = 0) +
  labs(title = "Pie Chart: Gender Distribution") +
  theme_void() +
  theme(legend.title = element_blank())
print(pie_chart)
# Scatter plot of income against number of visits with a fitted linear trend.
# The unconditional install.packages("ggplot2") and repeated library() call
# that preceded this plot were removed: ggplot2 is already attached (the pie
# chart earlier in the script uses it), and installing from inside the script
# runs on every execution and only yields a "package is in use" warning.
# NOTE(review): geom_smooth() fits y ~ x, i.e. income on visits, whereas the
# lm() model later in the script uses visits as the response - confirm axes.
ggplot(data, aes(x = visits, y = income)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(title = "Regression Plot: Visits vs Income", x = "Visits", y = "Income")
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of age against number of visits with a fitted linear trend.
# The unconditional install.packages("ggplot2") and repeated library() call
# that preceded this plot were removed: ggplot2 is already attached (the pie
# chart earlier in the script uses it), and installing from inside the script
# runs on every execution and only yields a "package is in use" warning.
# NOTE(review): geom_smooth() fits y ~ x, i.e. age on visits, whereas the
# lm() model later in the script uses visits as the response - confirm axes.
ggplot(data, aes(x = visits, y = age)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(title = "Regression Plot: Visits vs Age", x = "Visits", y = "Age")
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of illness count against number of visits with a fitted
# linear trend.
# The unconditional install.packages("ggplot2") and repeated library() call
# that preceded this plot were removed: ggplot2 is already attached (the pie
# chart earlier in the script uses it), and installing from inside the script
# runs on every execution and only yields a "package is in use" warning.
# NOTE(review): geom_smooth() fits y ~ x, i.e. illness on visits, whereas the
# lm() model later in the script uses visits as the response - confirm axes.
ggplot(data, aes(x = visits, y = illness)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(
    title = "Regression Plot: Visits vs Illness",
    x = "Visits",
    y = "Illness"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of health score against number of visits with a fitted linear
# trend.
# The unconditional install.packages("ggplot2") and repeated library() call
# that preceded this plot were removed: ggplot2 is already attached (the pie
# chart earlier in the script uses it), and installing from inside the script
# runs on every execution and only yields a "package is in use" warning.
# NOTE(review): geom_smooth() fits y ~ x, i.e. health on visits, whereas the
# lm() model later in the script uses visits as the response - confirm axes.
ggplot(data, aes(x = visits, y = health)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(title = "Regression Plot: Visits vs Health", x = "Visits", y = "Health")
## `geom_smooth()` using formula = 'y ~ x'
# Install car only when it is missing, then attach it unconditionally.
# requireNamespace() checks availability without attaching; library() then
# fails loudly if the package still cannot be loaded, instead of require()'s
# silent FALSE.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)
## Loading required package: car
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Attach car; its vif() is applied to the fitted model further down.
library(car)
# Ordinary least-squares fit of visits on the five numeric predictors.
# NOTE(review): visits is a small non-negative count that is mostly 0 (see the
# frequency table later in the script); a count model may be a better fit -
# confirm that a linear model is intended.
model <- lm(visits ~ income + age + illness + reduced + health, data = data)
# Print the coefficient table, residual quantiles, R-squared and F-test.
summary(model)
##
## Call:
## lm(formula = visits ~ income + age + illness + reduced + health,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1133 -0.2532 -0.1408 -0.0544 7.0798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.035222 0.032495 1.084 0.278452
## income -0.048965 0.028102 -1.742 0.081491 .
## age 0.231846 0.051325 4.517 6.40e-06 ***
## illness 0.063252 0.007932 7.974 1.88e-15 ***
## reduced 0.103807 0.003617 28.702 < 2e-16 ***
## health 0.017089 0.005152 3.317 0.000917 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7143 on 5184 degrees of freedom
## Multiple R-squared: 0.1998, Adjusted R-squared: 0.199
## F-statistic: 258.9 on 5 and 5184 DF, p-value: < 2.2e-16
# Variance inflation factor of each predictor in the fitted model.
car::vif(model)
## income age illness reduced health
## 1.092993 1.123451 1.226035 1.109279 1.218276
# Pairwise correlation matrix of the response and the numeric predictors.
corr_vars <- c("visits", "age", "income", "illness", "reduced", "health")
cor(data[corr_vars])
## visits age income illness reduced health
## visits 1.00000000 0.12453676 -0.07683983 0.2235524 0.41895444 0.19327156
## age 0.12453676 1.00000000 -0.27107338 0.2049839 0.09474494 0.01861580
## income -0.07683983 -0.27107338 1.00000000 -0.1488116 -0.04754529 -0.08579045
## illness 0.22355244 0.20498389 -0.14881155 1.0000000 0.21811627 0.36010981
## reduced 0.41895444 0.09474494 -0.04754529 0.2181163 1.00000000 0.28020818
## health 0.19327156 0.01861580 -0.08579045 0.3601098 0.28020818 1.00000000
# Frequency of each visit count in the data.
# The library()/install.packages() calls that used to sit here were removed:
# ggplot2 and dplyr are already attached by this point (both are used earlier
# in the script), so reinstalling them only produced "package is in use"
# warnings.
freq_table <- table(data$visits)
print(freq_table)
##
## 0 1 2 3 4 5 6 7 8 9
## 4141 782 174 30 24 9 12 12 5 1
# Turn the frequency table into a two-column data frame (visit count and
# number of respondents) and display it.
freq_df <- setNames(as.data.frame(freq_table), c("Visits", "Frequency"))
print(freq_df)
## Visits Frequency
## 1 0 4141
## 2 1 782
## 3 2 174
## 4 3 30
## 5 4 24
## 6 5 9
## 7 6 12
## 8 7 12
## 9 8 5
## 10 9 1
# Donut chart of the visit-count distribution.
# The ring geometry is computed on a copy of the 10-row frequency table. The
# earlier version assigned these 10-element vectors to columns of `data`;
# because nrow(data) (5190) is a multiple of 10 they were silently recycled
# over every row, which added three meaningless columns to `data` and drew one
# rectangle per respondent instead of one per visit count.
donut_df <- freq_df
donut_df$fraction <- donut_df$Frequency / sum(donut_df$Frequency)
# Each segment runs from the previous cumulative count up to its own.
donut_df$ymax <- cumsum(donut_df$Frequency)
donut_df$ymin <- c(0, head(donut_df$ymax, n = -1))
# Rectangles occupy x in [3, 4]; the x limits start at 2, which leaves the
# empty centre once the y axis is wrapped into polar coordinates.
ggplot(
  donut_df,
  aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = fraction)
) +
  geom_rect() +
  coord_polar(theta = "y") +
  xlim(c(2, 4))
# Recompute the visit-count frequency table and print it.
freq_table <- table(data[["visits"]])
print(freq_table)
##
## 0 1 2 3 4 5 6 7 8 9
## 4141 782 174 30 24 9 12 12 5 1
# Rebuild the two-column data frame form of the table and print it.
freq_df <- setNames(as.data.frame(freq_table), c("Visits", "Frequency"))
print(freq_df)
## Visits Frequency
## 1 0 4141
## 2 1 782
## 3 2 174
## 4 3 30
## 5 4 24
## 6 5 9
## 7 6 12
## 8 7 12
## 9 8 5
## 10 9 1
# Lollipop chart of how often each visit count occurs.
# The counts are derived from the data instead of being typed in by hand: the
# hand-entered vector listed 15 respondents for 7 visits, whereas the
# frequency table of data$visits shows 12. `visits` is now a factor with the
# same labels ("0".."9"), so the discrete axis is unchanged.
# The install.packages()/library() calls were dropped because ggplot2 is
# already attached by this point in the script.
freq_df <- as.data.frame(
  table(visits = data$visits),
  responseName = "frequency"
)
ggplot(freq_df, aes(x = visits, y = frequency)) +
  # Stem of each lollipop: from 0 up to the observed frequency.
  geom_segment(aes(xend = visits, y = 0, yend = frequency), color = "coral") +
  geom_point(color = "red", size = 4, alpha = 0.6) +
  theme_light() +
  # Flip so visit counts run down the vertical axis.
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  )