# Load the DoctorVisits survey data into `data`.
#
# The workspace is no longer wiped with rm(list = ls()): clearing the caller's
# global environment is a destructive side effect and does not give a clean
# session anyway (attached packages and options survive it).
#
# The CSV location defaults to the original path but can be overridden by
# setting the DOCTOR_VISITS_CSV environment variable, so the script is not
# tied to a single machine.
csv_path <- Sys.getenv(
  "DOCTOR_VISITS_CSV",
  unset = "C:/Users/tenuu/OneDrive/Desktop/DoctorVisits.csv"
)
if (!file.exists(csv_path)) {
  stop("DoctorVisits CSV not found at: ", csv_path, call. = FALSE)
}
data <- read.csv(csv_path)

# Variable descriptions:
#   visits    - Number of doctor visits in the past 2 weeks.
#   gender    - Factor indicating gender.
#   age       - Age in years divided by 100.
#   income    - Annual income in tens of thousands of dollars.
#   illness   - Number of illnesses in the past 2 weeks.
#   reduced   - Number of days of reduced activity in the past 2 weeks due to
#               illness or injury.
#   health    - General health questionnaire score using Goldberg's method.
#   private   - Factor. Does the individual have private health insurance?
#   freepoor  - Factor. Does the individual have free government health
#               insurance due to low income?
#   freerepat - Factor. Does the individual have free government health
#               insurance due to old age, disability or veteran status?
#   nchronic  - Factor. Is there a chronic condition not limiting activity?
#   lchronic  - Factor. Is there a chronic condition limiting activity?

# Recode the five yes/no indicator columns (insurance type and chronic
# condition flags) from "yes"/"no" to "1"/"0".
#
# One column-wise transformation replaces ten copy-pasted assignments, so a
# column cannot be half-recoded by a typo. The values deliberately stay
# character strings ("1"/"0"), exactly as before; any value other than
# "yes"/"no" (including NA) is left untouched.
yes_no_columns <- c("private", "freepoor", "freerepat", "nchronic", "lchronic")
data[yes_no_columns] <- lapply(data[yes_no_columns], function(values) {
  values[values == "yes"] <- "1"
  values[values == "no"] <- "0"
  values
})
str(data)
## 'data.frame':    5190 obs. of  12 variables:
##  $ visits   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ gender   : chr  "female" "female" "male" "male" ...
##  $ age      : num  0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 0.19 ...
##  $ income   : num  0.55 0.45 0.9 0.15 0.45 0.35 0.55 0.15 0.65 0.15 ...
##  $ illness  : int  1 1 3 1 2 5 4 3 2 1 ...
##  $ reduced  : int  4 2 0 0 5 1 0 0 0 0 ...
##  $ health   : int  1 1 0 0 1 9 2 6 5 0 ...
##  $ private  : chr  "1" "1" "0" "0" ...
##  $ freepoor : chr  "0" "0" "0" "0" ...
##  $ freerepat: chr  "0" "0" "0" "0" ...
##  $ nchronic : chr  "0" "0" "0" "0" ...
##  $ lchronic : chr  "0" "0" "0" "0" ...
# Standard deviation and variance of each numeric survey variable, one row per
# variable. sd() and var() are each evaluated directly on the column; unname()
# keeps the default 1..n row names in the resulting data frame.
summary_stats <- local({
  numeric_vars <- c("visits", "age", "income", "illness", "reduced", "health")
  numeric_cols <- data[numeric_vars]
  data.frame(
    Variable = numeric_vars,
    Standard_Deviation = unname(vapply(numeric_cols, sd, numeric(1))),
    Variance = unname(vapply(numeric_cols, var, numeric(1)))
  )
})
print(summary_stats)
##   Variable Standard_Deviation   Variance
## 1   visits          0.7981338 0.63701761
## 2      age          0.2047818 0.04193559
## 3   income          0.3689067 0.13609215
## 4  illness          1.3841524 1.91587793
## 5  reduced          2.8876284 8.33839781
## 6   health          2.1242665 4.51250808
# Per-column overview of the data frame: quartiles and mean for the numeric
# columns; the character columns only report length, class and mode.
summary(data)
##      visits          gender               age             income      
##  Min.   :0.0000   Length:5190        Min.   :0.1900   Min.   :0.0000  
##  1st Qu.:0.0000   Class :character   1st Qu.:0.2200   1st Qu.:0.2500  
##  Median :0.0000   Mode  :character   Median :0.3200   Median :0.5500  
##  Mean   :0.3017                      Mean   :0.4064   Mean   :0.5832  
##  3rd Qu.:0.0000                      3rd Qu.:0.6200   3rd Qu.:0.9000  
##  Max.   :9.0000                      Max.   :0.7200   Max.   :1.5000  
##     illness         reduced            health         private         
##  Min.   :0.000   Min.   : 0.0000   Min.   : 0.000   Length:5190       
##  1st Qu.:0.000   1st Qu.: 0.0000   1st Qu.: 0.000   Class :character  
##  Median :1.000   Median : 0.0000   Median : 0.000   Mode  :character  
##  Mean   :1.432   Mean   : 0.8619   Mean   : 1.218                     
##  3rd Qu.:2.000   3rd Qu.: 0.0000   3rd Qu.: 2.000                     
##  Max.   :5.000   Max.   :14.0000   Max.   :12.000                     
##    freepoor          freerepat           nchronic           lchronic        
##  Length:5190        Length:5190        Length:5190        Length:5190       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
# Attach ggplot2 and dplyr, installing each one first if it is missing.
#
# The previous `if (!require(pkg)) install.packages(pkg)` form installed a
# missing package but never attached it, so the first run on a fresh machine
# failed at the next `%>%` / ggplot() call. requireNamespace() only checks
# availability; library() then attaches the package and errors loudly if it
# still cannot be loaded.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Number of respondents per gender and each gender's share in percent.
gender_counts <- data %>%
  count(gender, name = "count") %>%
  mutate(percentage = count / sum(count) * 100)

# Pie chart of the gender split: a single stacked bar wrapped into polar
# coordinates, with the rounded percentage printed in the middle of each slice.
pie_chart <- ggplot(
  gender_counts,
  aes(x = "", y = percentage, fill = gender)
) +
  geom_col(width = 1) +
  geom_text(
    aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.5),
    colour = "black",
    size = 6
  ) +
  coord_polar(theta = "y", start = 0) +
  labs(title = "Pie Chart: Gender Distribution") +
  theme_void() +
  theme(legend.title = element_blank())

print(pie_chart)

# Scatter plot of one survey variable against the number of doctor visits,
# with a fitted straight line (geom_smooth(method = "lm")).
#
# The four plots below used to be four copies of the same code, each preceded
# by an unconditional install.packages("ggplot2") and a repeated
# library(ggplot2). Reinstalling on every run is slow, needs network access
# and is refused while the package is in use; ggplot2 is already attached
# earlier in the script, so those calls are gone.
#
# data    : data frame with a `visits` column and the column named by `y_var`.
# y_var   : name (character string) of the column drawn on the y axis.
# y_label : text used for the y-axis label and at the end of the plot title.
#
# Returns the ggplot object; as with the original bare ggplot() expressions,
# it is drawn when the call is evaluated at top level.
#
# NOTE(review): visits is on the x axis here, so the fitted line regresses
# `y_var` on visits, whereas the lm() model later in the script uses visits
# as the response - confirm this orientation is intended.
plot_visits_regression <- function(data, y_var, y_label) {
  ggplot(data, aes(x = visits, y = .data[[y_var]])) +
    geom_point() +
    geom_smooth(method = "lm", col = "blue") +
    ggtitle(paste0("Regression Plot: Visits vs ", y_label)) +
    xlab("Visits") +
    ylab(y_label)
}

plot_visits_regression(data, "income", "Income")
## `geom_smooth()` using formula = 'y ~ x'

plot_visits_regression(data, "age", "Age")
## `geom_smooth()` using formula = 'y ~ x'

plot_visits_regression(data, "illness", "Illness")
## `geom_smooth()` using formula = 'y ~ x'

plot_visits_regression(data, "health", "Health")
## `geom_smooth()` using formula = 'y ~ x'

# Attach the car package (used below for vif()), installing it first if it is
# missing. requireNamespace() is used purely as an availability check so that
# library() is the single place where the package gets attached and where a
# failed installation surfaces as an error.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Linear model (ordinary least squares) of the number of doctor visits on
# income, age, number of illnesses, days of reduced activity and health score,
# followed by the coefficient table and fit statistics.
# NOTE(review): visits is a non-negative count that is 0 for most respondents,
# so a count model (e.g. glm(..., family = poisson)) may be more appropriate
# than lm() - confirm with the analyst.
model <- lm(visits ~ income + age + illness + reduced + health, data = data)
summary(model)
## 
## Call:
## lm(formula = visits ~ income + age + illness + reduced + health, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1133 -0.2532 -0.1408 -0.0544  7.0798 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.035222   0.032495   1.084 0.278452    
## income      -0.048965   0.028102  -1.742 0.081491 .  
## age          0.231846   0.051325   4.517 6.40e-06 ***
## illness      0.063252   0.007932   7.974 1.88e-15 ***
## reduced      0.103807   0.003617  28.702  < 2e-16 ***
## health       0.017089   0.005152   3.317 0.000917 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7143 on 5184 degrees of freedom
## Multiple R-squared:  0.1998, Adjusted R-squared:  0.199 
## F-statistic: 258.9 on 5 and 5184 DF,  p-value: < 2.2e-16
# Variance inflation factor for each predictor in `model` (vif() comes from
# the car package), used to check the predictors for multicollinearity.
vif(model)
##   income      age  illness  reduced   health 
## 1.092993 1.123451 1.226035 1.109279 1.218276
# Pairwise correlation matrix of the response and the five numeric predictors.
cor(
  data[, c("visits", "age", "income", "illness", "reduced", "health")]
)
##              visits         age      income    illness     reduced      health
## visits   1.00000000  0.12453676 -0.07683983  0.2235524  0.41895444  0.19327156
## age      0.12453676  1.00000000 -0.27107338  0.2049839  0.09474494  0.01861580
## income  -0.07683983 -0.27107338  1.00000000 -0.1488116 -0.04754529 -0.08579045
## illness  0.22355244  0.20498389 -0.14881155  1.0000000  0.21811627  0.36010981
## reduced  0.41895444  0.09474494 -0.04754529  0.2181163  1.00000000  0.28020818
## health   0.19327156  0.01861580 -0.08579045  0.3601098  0.28020818  1.00000000
# Frequency of each visit count.
#
# ggplot2 and dplyr are already attached earlier in the script, so the
# repeated library()/install.packages() calls that stood here were removed:
# reinstalling packages on every run is slow, requires network access and is
# refused while the packages are in use.
freq_table <- table(data$visits)
print(freq_table)
## 
##    0    1    2    3    4    5    6    7    8    9 
## 4141  782  174   30   24    9   12   12    5    1
# Same counts as a two-column data frame: Visits (factor) and Frequency.
freq_df <- as.data.frame(freq_table)
colnames(freq_df) <- c("Visits", "Frequency")
print(freq_df)
##    Visits Frequency
## 1       0      4141
## 2       1       782
## 3       2       174
## 4       3        30
## 5       4        24
## 6       5         9
## 7       6        12
## 8       7        12
## 9       8         5
## 10      9         1
# Donut chart of the share of respondents at each visit count.
#
# The slice geometry is stored on freq_df (one row per visit count). It was
# previously written into `data`, where the 10 values were silently recycled
# across all 5190 rows, so the plot drew thousands of meaningless rectangles
# and `data` gained three bogus columns.
#
# fraction : share of respondents with that visit count.
# ymax/ymin: cumulative upper and lower bound of each slice on the [0, 1] ring.
freq_df$fraction <- freq_df$Frequency / sum(freq_df$Frequency)
freq_df$ymax <- cumsum(freq_df$fraction)
freq_df$ymin <- c(0, head(freq_df$ymax, n = -1))

# xmin/xmax fix the ring thickness; xlim() starting below xmin leaves the hole
# in the middle. Slices are coloured by visit count so the legend identifies
# them.
ggplot(
  freq_df,
  aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = Visits)
) +
  geom_rect() +
  coord_polar(theta = "y") +
  xlim(c(2, 4))

# Tabulate how many respondents reported each number of doctor visits.
freq_table <- table(data[["visits"]])
print(freq_table)
## 
##    0    1    2    3    4    5    6    7    8    9 
## 4141  782  174   30   24    9   12   12    5    1
# Convert the table to a data frame with descriptive column names.
freq_df <- setNames(as.data.frame(freq_table), c("Visits", "Frequency"))
print(freq_df)
##    Visits Frequency
## 1       0      4141
## 2       1       782
## 3       2       174
## 4       3        30
## 5       4        24
## 6       5         9
## 7       6        12
## 8       7        12
## 9       8         5
## 10      9         1
# Lollipop chart of the number of respondents at each visit count.
#
# The counts are taken from the data instead of being typed in by hand: the
# hand-typed vector listed 15 respondents with 7 visits, whereas the frequency
# table of data$visits shows 12. The redundant install.packages()/library()
# calls were dropped because ggplot2 is already attached earlier in the script.
visit_counts <- table(data$visits)
freq_df <- data.frame(
  # Factor levels keep the visit counts in numeric order on the axis even if
  # values of 10 or more ever appear (plain strings would sort "10" before "2").
  visits = factor(names(visit_counts), levels = names(visit_counts)),
  frequency = as.vector(visit_counts)
)

ggplot(freq_df, aes(x = visits, y = frequency)) +
  # Stem of each lollipop, from zero up to the observed frequency.
  geom_segment(
    aes(x = visits, xend = visits, y = 0, yend = frequency),
    color = "coral"
  ) +
  geom_point(color = "red", size = 4, alpha = 0.6) +
  theme_light() +
  # Flip so that visit counts run down the vertical axis.
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  )