Descriptive Statistics
library(readxl)
# Load the RateMDs workbook from the working directory.
ratemdsfinal <- readxl::read_excel(path = "ratemdsfinal.xlsx")
## New names:
## * `` -> ...1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(xtable)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(knitr)
library(psych)
# Number of missing values in each column of the RateMDs data.
ratemdsfinal %>%
  is.na() %>%
  colSums()
## ...1 Doctor_names
## 0 0
## Gender Years of Experience
## 0 59
## Doctor_reviews Rating
## 0 0
## Review_sentences Count of Sentiment
## 0 0
## Sum of Pos Score Count of Positive Sentiment
## 0 0
## Positive_Proportion Average pos_score
## 0 0
## Sum of Neg Score Negative_Proportion
## 0 0
## Count of Negative Sentiment Average neg_score
## 0 0
## Communication_positive Expertisepositive
## 0 0
## Timepositive Bedside_positive
## 0 0
## Officepositive Costpositive
## 0 0
## Communication_negative Expertisenegative
## 0 0
## Time_negative Bedside_negative
## 0 0
## Office_negative Cost_negative
## 0 0
## Word Count Review Count
## 0 0
## Phrase Count Average words per review
## 0 0
## Overall_score
## 0
# Impute missing Years of Experience with the mean of the observed values,
# rounded to a whole number of years.
years_missing <- is.na(ratemdsfinal$`Years of Experience`)
ratemdsfinal$`Years of Experience`[years_missing] <-
  round(mean(ratemdsfinal$`Years of Experience`, na.rm = TRUE))
# Keep only the numeric columns, then drop the row-index column `...1`
# that read_excel created for the unnamed first column.
num_cols <- vapply(ratemdsfinal, is.numeric, logical(1))
rate <- ratemdsfinal[, num_cols]
rate[["...1"]] <- NULL
str(rate)
## tibble [793 × 28] (S3: tbl_df/tbl/data.frame)
## $ Years of Experience : num [1:793] 21 15 26 21 27 38 49 36 25 26 ...
## $ Rating : num [1:793] 4.85 4.47 4.92 4.89 4.89 5 4.44 4.75 4.93 4.78 ...
## $ Count of Sentiment : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
## $ Sum of Pos Score : num [1:793] 9.81 18.01 7.11 8.41 6.54 ...
## $ Count of Positive Sentiment: num [1:793] 12 21 8 9 9 7 15 10 7 8 ...
## $ Positive_Proportion : num [1:793] 0.857 0.913 0.889 1 1 ...
## $ Average pos_score : num [1:793] 0.817 0.858 0.889 0.934 0.726 ...
## $ Sum of Neg Score : num [1:793] -0.9394 -0.6902 -0.0018 0 0 ...
## $ Negative_Proportion : num [1:793] 0.143 0.087 0.111 0 0 ...
## $ Count of Negative Sentiment: num [1:793] 2 2 1 0 0 0 2 1 0 0 ...
## $ Average neg_score : num [1:793] -0.4697 -0.3451 -0.0018 0 0 ...
## $ Communication_positive : num [1:793] 0.0323 0.0876 0 0.0513 0.1 ...
## $ Expertisepositive : num [1:793] 0.145 0.073 0.139 0.103 0.267 ...
## $ Timepositive : num [1:793] 0.0806 0.0438 0.0556 0.1538 0.1667 ...
## $ Bedside_positive : num [1:793] 0 0.19 0.194 0.103 0.167 ...
## $ Officepositive : num [1:793] 0.0645 0.0949 0.1111 0.0513 0.0333 ...
## $ Costpositive : num [1:793] 0.0161 0.073 0.0833 0 0.0667 ...
## $ Communication_negative : num [1:793] 0.0484 0.0438 0.0278 0 0 ...
## $ Expertisenegative : num [1:793] 0.0645 0.0219 0.0278 0 0.0333 ...
## $ Time_negative : num [1:793] 0.0968 0.0511 0 0 0 ...
## $ Bedside_negative : num [1:793] 0.0161 0.0219 0 0 0 ...
## $ Office_negative : num [1:793] 0.0323 0.0365 0 0 0 ...
## $ Cost_negative : num [1:793] 0 0.0219 0 0 0 ...
## $ Word Count : num [1:793] 523 1604 288 358 280 ...
## $ Review Count : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
## $ Phrase Count : num [1:793] 67 165 38 42 48 32 133 93 41 67 ...
## $ Average words per review : num [1:793] 37 70 32 40 31 36 69 100 38 77 ...
## $ Overall_score : num [1:793] 0.348 0.513 0.887 0.934 0.726 ...
# Shorten the first column's name (Years of Experience) to "Exp".
colnames(rate)[1L] <- "Exp"
# Descriptive statistics for RateMDs: mean, sd, median, min, max and skew
# of every column in `rate`, rendered as a table.
describe(rate)[c("mean", "sd", "median", "min", "max", "skew")] %>%
  xtable() %>%
  kable()
|
|
mean
|
sd
|
median
|
min
|
max
|
skew
|
|
Exp
|
25.9394704
|
10.7023017
|
26.0000000
|
2.0000
|
56.0000
|
0.2925266
|
|
Rating
|
3.9887516
|
1.1456304
|
4.5000000
|
1.0000
|
5.0000
|
-1.0690984
|
|
Count of Sentiment
|
2.9621690
|
3.2622855
|
2.0000000
|
1.0000
|
30.0000
|
3.0905276
|
|
Sum of Pos Score
|
1.6952861
|
2.0505289
|
0.9493000
|
0.0000
|
18.0120
|
2.9120548
|
|
Count of Positive Sentiment
|
2.1601513
|
2.5047079
|
1.0000000
|
0.0000
|
21.0000
|
2.8523571
|
|
Positive_Proportion
|
0.7329744
|
0.3628249
|
1.0000000
|
0.0000
|
1.0000
|
-1.0791118
|
|
Average pos_score
|
0.6597428
|
0.3194366
|
0.7778000
|
0.0000
|
0.9923
|
-1.1775197
|
|
Sum of Neg Score
|
-0.4450890
|
0.8923222
|
0.0000000
|
-10.8437
|
0.0000
|
-4.3611618
|
|
Negative_Proportion
|
0.2263399
|
0.3439964
|
0.0000000
|
0.0000
|
1.0000
|
1.3315686
|
|
Count of Negative Sentiment
|
0.6935687
|
1.2951625
|
0.0000000
|
0.0000
|
15.0000
|
4.1368418
|
|
Average neg_score
|
-0.2453372
|
0.3380014
|
0.0000000
|
-0.9810
|
0.0000
|
-0.8781404
|
|
Communication_positive
|
0.0519260
|
0.1105353
|
0.0000000
|
0.0000
|
1.0000
|
4.0196863
|
|
Expertisepositive
|
0.1172052
|
0.1680621
|
0.0666667
|
0.0000
|
1.0000
|
2.6532162
|
|
Timepositive
|
0.0530472
|
0.1028384
|
0.0000000
|
0.0000
|
1.0000
|
3.2624407
|
|
Bedside_positive
|
0.1393430
|
0.1792870
|
0.1000000
|
0.0000
|
1.0000
|
2.2700317
|
|
Officepositive
|
0.0567404
|
0.1138064
|
0.0000000
|
0.0000
|
1.0000
|
3.5556046
|
|
Costpositive
|
0.0388190
|
0.0780689
|
0.0000000
|
0.0000
|
0.5000
|
2.6948463
|
|
Communication_negative
|
0.0427782
|
0.0923958
|
0.0000000
|
0.0000
|
1.0000
|
3.9127936
|
|
Expertisenegative
|
0.0620597
|
0.1284017
|
0.0000000
|
0.0000
|
1.0000
|
3.9608482
|
|
Time_negative
|
0.0499659
|
0.1058386
|
0.0000000
|
0.0000
|
1.0000
|
3.8477383
|
|
Bedside_negative
|
0.0335470
|
0.1045708
|
0.0000000
|
0.0000
|
1.0000
|
6.1851972
|
|
Office_negative
|
0.0337728
|
0.0817816
|
0.0000000
|
0.0000
|
0.6000
|
3.6060950
|
|
Cost_negative
|
0.0140710
|
0.0598381
|
0.0000000
|
0.0000
|
1.0000
|
8.8894980
|
|
Word Count
|
169.6973518
|
250.4917975
|
78.0000000
|
0.0000
|
2350.0000
|
3.5365532
|
|
Review Count
|
2.9621690
|
3.2622855
|
2.0000000
|
1.0000
|
30.0000
|
3.0905276
|
|
Phrase Count
|
17.4186633
|
23.0809676
|
9.0000000
|
1.0000
|
195.0000
|
3.4406515
|
|
Average words per review
|
52.6557377
|
53.6781351
|
40.0000000
|
0.0000
|
737.0000
|
5.8080336
|
|
Overall_score
|
0.4144056
|
0.5385908
|
0.6249000
|
-0.9810
|
0.9914
|
-0.9302287
|
# Checking for Normality
library(ggpubr)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Loading required package: magrittr
# Normality check for the RateMDs Rating: Q-Q plot first ...
ggqqplot(rate$Rating)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(rate$Rating)
##
## Shapiro-Wilk normality test
##
## data: rate$Rating
## W = 0.83136, p-value < 2.2e-16
# Normality check for the RateMDs Positive_Proportion: Q-Q plot first ...
ggqqplot(rate$Positive_Proportion)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(rate$Positive_Proportion)
##
## Shapiro-Wilk normality test
##
## data: rate$Positive_Proportion
## W = 0.72269, p-value < 2.2e-16
# Normality check for the RateMDs Negative_Proportion: Q-Q plot first ...
ggqqplot(rate$Negative_Proportion)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(rate$Negative_Proportion)
##
## Shapiro-Wilk normality test
##
## data: rate$Negative_Proportion
## W = 0.67902, p-value < 2.2e-16
# Normality check for the RateMDs `Average pos_score`: Q-Q plot first ...
ggqqplot(rate$`Average pos_score`)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(rate$`Average pos_score`)
##
## Shapiro-Wilk normality test
##
## data: rate$`Average pos_score`
## W = 0.78975, p-value < 2.2e-16
# Normality check for the RateMDs `Average neg_score`: Q-Q plot first ...
ggqqplot(rate$`Average neg_score`)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(rate$`Average neg_score`)
##
## Shapiro-Wilk normality test
##
## data: rate$`Average neg_score`
## W = 0.7132, p-value < 2.2e-16
## Multiple t-tests for RateMDs
# One two-sample t-test per column of `rate`, with the rows split by
# ratemdsfinal$Gender and equal variances assumed (var.equal = TRUE).
# The results are kept in the list `a`, one element per column of `rate`;
# nothing is printed here.
a<- lapply(rate[], function(x) t.test(x ~ ratemdsfinal$Gender, var.equal = TRUE))
Statistics based on Gender
# Gender is compared as a categorical variable from here on.
ratemdsfinal$Gender <- factor(ratemdsfinal$Gender)

# `rate` holds only the numeric columns, so attach Gender as a real column of
# the plotting data instead of mapping an external vector inside aes().
# `rate` itself is left untouched because later summaries use it as-is.
rate_with_gender <- rate %>%
  mutate(Gender = ratemdsfinal$Gender)

# Overlaid density curves of one column of `data`, one curve per Gender.
#   data    : data frame with a `Gender` factor (levels Female / Male)
#   column  : name of the column to plot on the x axis (string)
#   label   : text used in the title and the y-axis label
#   site    : data-source name shown in the title
#   x_label : x-axis label (defaults to `label`)
#   alpha   : fill transparency of the density curves
# The legend is hidden and the colour key is written into the title, so the
# fill colours are pinned explicitly; with the default palette Female would be
# drawn in red and Male in blue-green, contradicting the title.
plot_density_by_gender <- function(data, column, label, site,
                                   x_label = label, alpha = 0.5) {
  ggplot(data, aes(x = .data[[column]], fill = .data[["Gender"]])) +
    geom_density(alpha = alpha, position = "identity") +
    scale_fill_manual(values = c(Female = "blue", Male = "red")) +
    labs(
      title = paste0(
        "Density of ", label, " by Gender on ", site,
        "\n(blue=Female, red=Male)"
      ),
      x = x_label,
      y = paste0("Density of ", label)
    ) +
    theme(legend.position = "none")
}

# Rating by Gender
plot_density_by_gender(rate_with_gender, "Rating", "Rating", "RateMDs",
                       alpha = 0.4)

# Positive_Proportion by Gender
plot_density_by_gender(rate_with_gender, "Positive_Proportion",
                       "Positive Proportion", "RateMDs",
                       x_label = "Positive_Proportion")

# Negative_Proportion by Gender
plot_density_by_gender(rate_with_gender, "Negative_Proportion",
                       "Negative_Proportion", "RateMDs")

# `Average neg_score` by Gender
plot_density_by_gender(rate_with_gender, "Average neg_score",
                       "`Average neg_score`", "RateMDs")

# `Average pos_score` by Gender
plot_density_by_gender(rate_with_gender, "Average pos_score",
                       "`Average pos_score`", "RateMDs")

# Years of Experience by Gender
plot_density_by_gender(rate_with_gender, "Exp", "Years of Experience",
                       "RateMDs",
                       x_label = "Years of Experience - in years")

# Overall_score by Gender
plot_density_by_gender(rate_with_gender, "Overall_score", "Overall_score",
                       "RateMDs")
## Statistics by No of Years of Experience
# Re-count the missing values per column of the RateMDs data.
ratemdsfinal %>%
  is.na() %>%
  colSums()
## ...1 Doctor_names
## 0 0
## Gender Years of Experience
## 0 0
## Doctor_reviews Rating
## 0 0
## Review_sentences Count of Sentiment
## 0 0
## Sum of Pos Score Count of Positive Sentiment
## 0 0
## Positive_Proportion Average pos_score
## 0 0
## Sum of Neg Score Negative_Proportion
## 0 0
## Count of Negative Sentiment Average neg_score
## 0 0
## Communication_positive Expertisepositive
## 0 0
## Timepositive Bedside_positive
## 0 0
## Officepositive Costpositive
## 0 0
## Communication_negative Expertisenegative
## 0 0
## Time_negative Bedside_negative
## 0 0
## Office_negative Cost_negative
## 0 0
## Word Count Review Count
## 0 0
## Phrase Count Average words per review
## 0 0
## Overall_score
## 0
# Keep a copy of the data as it stands at this point.
r <- ratemdsfinal
# Five-number summary (plus mean) of Years of Experience.
summary(ratemdsfinal[["Years of Experience"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 19.00 26.00 25.94 33.00 56.00
# Recode Years of Experience (numeric) into four bands, in place.
# cut() with open-ended outer breaks is used instead of enumerating every
# value from 0 to 56: with an enumerated list, any value not listed (a
# non-integer, or anything above 56) silently becomes NA.
# Intervals are right-closed: <=6, 7-14, 15-23, >=24.
# The labels are kept exactly as before (including the trailing space in
# "7 to 14 years ") in case code elsewhere matches on them.
# NOTE(review): the first band includes 6, so "Less than 6 years" is slightly
# mislabelled - confirm the intended cut-off before renaming.
ratemdsfinal$`Years of Experience` <- cut(
  ratemdsfinal$`Years of Experience`,
  breaks = c(-Inf, 6, 14, 23, Inf),
  labels = c(
    "Less than 6 years",
    "7 to 14 years ",
    "15 to 23 years",
    "24 and above"
  )
)
summary(ratemdsfinal$`Years of Experience`)
## Less than 6 years 7 to 14 years 15 to 23 years 24 and above
## 19 86 240 448
# Shows how the rating is distributed within each Years-of-Experience band.
# The band factor lives in `ratemdsfinal`, so attach it to the numeric data as
# a real column instead of referencing an external vector inside
# facet_wrap()/aes(); this also gives the legend a readable title.
rate_by_experience <- rate %>%
  mutate(Experience = ratemdsfinal$`Years of Experience`)

# One density panel per experience band.
ggplot(data = rate_by_experience, mapping = aes(x = Rating)) +
  geom_density(alpha = 0.4, position = "identity") +
  facet_wrap(~ Experience) +
  ggtitle("Visualising Years of Experience over Rating") +
  xlab("Rating") +
  theme_bw()

# The same densities overlaid in a single panel, coloured by band.
rate_by_experience %>%
  ggplot(aes(x = Rating, fill = Experience)) +
  geom_density(alpha = 0.7, position = "identity") +
  labs(
    title = "Density of Rating over Years of Experience on RateMDs",
    x = "Rating Received on RateMDs",
    y = "Density of Rating",
    fill = "Years of Experience"
  )

str(ratemdsfinal)
## tibble [793 × 33] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:793] 0 1 2 3 4 5 6 7 8 9 ...
## $ Doctor_names : chr [1:793] "Dr. Martine T. Nelson" "Dr. Jason C. Tjaden" "Dr. Paul Tortoriello" "Dr. Maria Rosa" ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 1 1 2 2 2 1 ...
## $ Years of Experience : Factor w/ 4 levels "Less than 6 years",..: 3 3 4 3 4 4 4 4 4 4 ...
## $ Doctor_reviews : chr [1:793] "['Dr Nelson and her staff are wonderful! They are always on time and very caring!', 'Dr. Nelson was our family "| __truncated__ "['We love Dr. Tjaden! Our daughter had medical issues when she was born and he was incredibly supportive, affir"| __truncated__ "['Excellent doctor! Absolute best pediatrician. We switched from another local doctor who misdiagnosed my daugh"| __truncated__ "[\"Dr. Rosa is one of a kind. Genuine, caring and full of kindness. She's gentle with my girls and they LOVE he"| __truncated__ ...
## $ Rating : num [1:793] 4.85 4.47 4.92 4.89 4.89 5 4.44 4.75 4.93 4.78 ...
## $ Review_sentences : chr [1:793] "[\"['Dr Nelson and her staff are wonderful\", ' They are always on time and very caring', \", 'Dr\", ' Nelson w"| __truncated__ "[\"['We love Dr\", ' Tjaden', ' Our daughter had medical issues when she was born and he was incredibly support"| __truncated__ "[\"['Excellent doctor\", ' Absolute best pediatrician', ' We switched from another local doctor who misdiagnose"| __truncated__ "['[\"Dr', ' Rosa is one of a kind', ' Genuine, caring and full of kindness', \" She's gentle with my girls and "| __truncated__ ...
## $ Count of Sentiment : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
## $ Sum of Pos Score : num [1:793] 9.81 18.01 7.11 8.41 6.54 ...
## $ Count of Positive Sentiment: num [1:793] 12 21 8 9 9 7 15 10 7 8 ...
## $ Positive_Proportion : num [1:793] 0.857 0.913 0.889 1 1 ...
## $ Average pos_score : num [1:793] 0.817 0.858 0.889 0.934 0.726 ...
## $ Sum of Neg Score : num [1:793] -0.9394 -0.6902 -0.0018 0 0 ...
## $ Negative_Proportion : num [1:793] 0.143 0.087 0.111 0 0 ...
## $ Count of Negative Sentiment: num [1:793] 2 2 1 0 0 0 2 1 0 0 ...
## $ Average neg_score : num [1:793] -0.4697 -0.3451 -0.0018 0 0 ...
## $ Communication_positive : num [1:793] 0.0323 0.0876 0 0.0513 0.1 ...
## $ Expertisepositive : num [1:793] 0.145 0.073 0.139 0.103 0.267 ...
## $ Timepositive : num [1:793] 0.0806 0.0438 0.0556 0.1538 0.1667 ...
## $ Bedside_positive : num [1:793] 0 0.19 0.194 0.103 0.167 ...
## $ Officepositive : num [1:793] 0.0645 0.0949 0.1111 0.0513 0.0333 ...
## $ Costpositive : num [1:793] 0.0161 0.073 0.0833 0 0.0667 ...
## $ Communication_negative : num [1:793] 0.0484 0.0438 0.0278 0 0 ...
## $ Expertisenegative : num [1:793] 0.0645 0.0219 0.0278 0 0.0333 ...
## $ Time_negative : num [1:793] 0.0968 0.0511 0 0 0 ...
## $ Bedside_negative : num [1:793] 0.0161 0.0219 0 0 0 ...
## $ Office_negative : num [1:793] 0.0323 0.0365 0 0 0 ...
## $ Cost_negative : num [1:793] 0 0.0219 0 0 0 ...
## $ Word Count : num [1:793] 523 1604 288 358 280 ...
## $ Review Count : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
## $ Phrase Count : num [1:793] 67 165 38 42 48 32 133 93 41 67 ...
## $ Average words per review : num [1:793] 37 70 32 40 31 36 69 100 38 77 ...
## $ Overall_score : num [1:793] 0.348 0.513 0.887 0.934 0.726 ...
Healthgrades
library(readxl)
# Load the Healthgrades workbook from the working directory and inspect it.
healthgrades <- readxl::read_excel(path = "healthgradesfinal.xlsx")
str(healthgrades)
## tibble [247 × 33] (S3: tbl_df/tbl/data.frame)
## $ DoctorName : chr [1:247] "Dr. Amy Williams, MD" "Dr. Roma Franzia, MD" "Dr. Elizabeth Manjooran, MD" "Dr. Jason Canel, MD" ...
## $ Age : chr [1:247] "• Age 42" "• Age 50" "• Age 59" "• Age 48" ...
## $ Gender : chr [1:247] "Female" "Female" "Female" "Male" ...
## $ Speciality : chr [1:247] "Pediatrics" "Pediatrics" "Pediatrics" "Pediatrics" ...
## $ Years of Experience : num [1:247] 12 21 28 11 43 24 42 39 16 41 ...
## $ Biography : chr [1:247] "Dr. Amy Williams, MD is a pediatrics specialist in Chicago, IL. She specializes in pediatrics." "Dr. Roma Franzia, MD is a pediatrics specialist in Winnetka, IL and has been practicing for 21 years. She gradu"| __truncated__ "Dr. Elizabeth Manjooran, MD is a pediatrics specialist in Des Plaines, IL. She specializes in pediatrics." "Dr. Jason Canel, MD is a pediatrics specialist in Glenview, IL and has been practicing for 11 years. He graduat"| __truncated__ ...
## $ Rating : num [1:247] 4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
## $ Reviews : chr [1:247] "[\"I've been seeing Dr. Williams for years and through 2 pregnancies. She is the absolute best. Caring, compass"| __truncated__ "[\"Dr. Roma Franzia is the best pediatrician ever! So caring about my kids and family as a whole, always availa"| __truncated__ "['She’s not only a very good doctor but also a wonderful person! Also her office staff is very professional, wa"| __truncated__ "[\"We were assigned Dr Canel at my daughter's birth. I am so glad he was on call that day! My daughter is now t"| __truncated__ ...
## $ Review_cleaned : chr [1:247] "['[\"I\\'ve been seeing Dr', ' Williams for years ', ' through 2 pregnancies', ' She is the absolute best', ' C"| __truncated__ "['[\"Dr', ' Roma Franzia is the best pediatrician ever', ' So caring about my kids ', ' family as a whole', ' a"| __truncated__ "[\"['She’s not only a very good doctor \", ' also a wonderful person', ' Also her office staff is very professi"| __truncated__ "['[\"We were assigned Dr Canel at my daughter\\'s birth', ' I am so glad he was on call that day', ' My daughte"| __truncated__ ...
## $ Word Count : num [1:247] 614 793 176 199 84 182 144 690 50 68 ...
## $ Review Count : num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
## $ Phrase Count : num [1:247] 166 143 35 47 16 42 41 152 11 11 ...
## $ Total Count of Sentiment: num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
## $ Positive_Proportion : num [1:247] 1 1 1 1 1 ...
## $ Count of pos Sentiment : num [1:247] 21 11 4 5 2 3 5 16 3 2 ...
## $ Average pos_score : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
## $ Negative_Proportion : num [1:247] 0 0 0 0 0 ...
## $ Count of neg Sentiment : num [1:247] 0 0 0 0 0 0 0 1 0 0 ...
## $ Average neg_score : num [1:247] 0 0 0 0 0 0 0 -0.612 0 0 ...
## $ Communication_positive : num [1:247] 0.0676 0.0286 0.1 0.1538 0.3333 ...
## $ Expertisepositive : num [1:247] 0.189 0.114 0.1 0.154 0.167 ...
## $ Timepositive : num [1:247] 0.1216 0.0714 0 0.1154 0 ...
## $ Bedside_positive : num [1:247] 0.108 0.186 0.35 0.115 0.333 ...
## $ Officepositive : num [1:247] 0.027 0.0429 0.15 0.0769 0 ...
## $ Costpositive : num [1:247] 0.0541 0.0286 0 0.0385 0 ...
## $ Communication_negative : num [1:247] 0 0.0143 0 0 0 ...
## $ Expertisenegative : num [1:247] 0.027 0.0143 0 0 0 ...
## $ Time_negative : num [1:247] 0 0.0429 0 0 0 ...
## $ Bedside_negative : num [1:247] 0 0.0429 0 0 0 ...
## $ Office_negative : num [1:247] 0 0.0143 0 0 0 ...
## $ Cost_negative : num [1:247] 0 0 0 0 0 0 0 0 0 0 ...
## $ Average words per review: num [1:247] 29 72 44 40 42 61 29 41 17 34 ...
## $ Overall_score : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
# Treat Gender and Speciality as categorical variables.
healthgrades[c("Gender", "Speciality")] <-
  lapply(healthgrades[c("Gender", "Speciality")], factor)
# Number of doctors per speciality.
table(healthgrades[["Speciality"]])
##
## Allergy & Immunology Dermatology Family Medicine
## 3 1 2
## Internal Medicine Neonatal Medicine Pediatric Medicine
## 18 1 2
## Pediatrics
## 220
# Keep only the numeric columns of the Healthgrades data.
num_cols <- vapply(healthgrades, is.numeric, logical(1))
health <- healthgrades[, num_cols]
str(health)
## tibble [247 × 26] (S3: tbl_df/tbl/data.frame)
## $ Years of Experience : num [1:247] 12 21 28 11 43 24 42 39 16 41 ...
## $ Rating : num [1:247] 4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
## $ Word Count : num [1:247] 614 793 176 199 84 182 144 690 50 68 ...
## $ Review Count : num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
## $ Phrase Count : num [1:247] 166 143 35 47 16 42 41 152 11 11 ...
## $ Total Count of Sentiment: num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
## $ Positive_Proportion : num [1:247] 1 1 1 1 1 ...
## $ Count of pos Sentiment : num [1:247] 21 11 4 5 2 3 5 16 3 2 ...
## $ Average pos_score : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
## $ Negative_Proportion : num [1:247] 0 0 0 0 0 ...
## $ Count of neg Sentiment : num [1:247] 0 0 0 0 0 0 0 1 0 0 ...
## $ Average neg_score : num [1:247] 0 0 0 0 0 0 0 -0.612 0 0 ...
## $ Communication_positive : num [1:247] 0.0676 0.0286 0.1 0.1538 0.3333 ...
## $ Expertisepositive : num [1:247] 0.189 0.114 0.1 0.154 0.167 ...
## $ Timepositive : num [1:247] 0.1216 0.0714 0 0.1154 0 ...
## $ Bedside_positive : num [1:247] 0.108 0.186 0.35 0.115 0.333 ...
## $ Officepositive : num [1:247] 0.027 0.0429 0.15 0.0769 0 ...
## $ Costpositive : num [1:247] 0.0541 0.0286 0 0.0385 0 ...
## $ Communication_negative : num [1:247] 0 0.0143 0 0 0 ...
## $ Expertisenegative : num [1:247] 0.027 0.0143 0 0 0 ...
## $ Time_negative : num [1:247] 0 0.0429 0 0 0 ...
## $ Bedside_negative : num [1:247] 0 0.0429 0 0 0 ...
## $ Office_negative : num [1:247] 0 0.0143 0 0 0 ...
## $ Cost_negative : num [1:247] 0 0 0 0 0 0 0 0 0 0 ...
## $ Average words per review: num [1:247] 29 72 44 40 42 61 29 41 17 34 ...
## $ Overall_score : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
# Print Descriptive Statistics for Healthgrades
# Columns are picked by name (mean, sd, median, min, max, skew) rather than by
# position, so the table does not depend on the column order of describe().
stat_cols <- c("mean", "sd", "median", "min", "max", "skew")
kable(xtable(describe(health)[stat_cols]))
|
|
mean
|
sd
|
median
|
min
|
max
|
skew
|
|
Years of Experience
|
25.8600823
|
10.9100838
|
26.0000000
|
2.0000
|
53.0000000
|
0.1548193
|
|
Rating
|
4.5979757
|
0.3479848
|
4.6000000
|
3.5000
|
5.0000000
|
-0.5336239
|
|
Word Count
|
194.0971660
|
270.5785255
|
124.0000000
|
11.0000
|
2018.0000000
|
4.1629431
|
|
Review Count
|
4.4089069
|
6.7522054
|
3.0000000
|
1.0000
|
54.0000000
|
4.9852808
|
|
Phrase Count
|
39.4777328
|
56.4946083
|
23.0000000
|
3.0000
|
432.0000000
|
4.4631481
|
|
Total Count of Sentiment
|
4.7692308
|
9.5512835
|
3.0000000
|
1.0000
|
90.0000000
|
6.8203153
|
|
Positive_Proportion
|
0.9075924
|
0.1964834
|
1.0000000
|
0.0000
|
1.0000000
|
-2.6376345
|
|
Count of pos Sentiment
|
4.2995951
|
9.0171088
|
2.0000000
|
0.0000
|
86.0000000
|
7.0230331
|
|
Average pos_score
|
0.7945390
|
0.1824017
|
0.8376093
|
0.0000
|
0.9857000
|
-2.5855164
|
|
Negative_Proportion
|
0.0924076
|
0.1964834
|
0.0000000
|
0.0000
|
1.0000000
|
2.6376345
|
|
Count of neg Sentiment
|
0.4696356
|
1.0997455
|
0.0000000
|
0.0000
|
9.0000000
|
4.0034054
|
|
Average neg_score
|
-0.1475103
|
0.2808211
|
0.0000000
|
-0.9471
|
0.0000000
|
-1.6245998
|
|
Communication_positive
|
0.1074784
|
0.1175589
|
0.0769231
|
0.0000
|
0.7500000
|
1.5961346
|
|
Expertisepositive
|
0.1554605
|
0.1370798
|
0.1428571
|
0.0000
|
1.0000000
|
1.6127717
|
|
Timepositive
|
0.0940978
|
0.1083136
|
0.0666667
|
0.0000
|
0.5000000
|
1.4619393
|
|
Bedside_positive
|
0.1563275
|
0.1493452
|
0.1428571
|
0.0000
|
1.0000000
|
1.9710141
|
|
Officepositive
|
0.0472850
|
0.0756680
|
0.0000000
|
0.0000
|
0.3333333
|
1.7511825
|
|
Costpositive
|
0.0276534
|
0.0562286
|
0.0000000
|
0.0000
|
0.3333333
|
2.7729653
|
|
Communication_negative
|
0.0310048
|
0.0852062
|
0.0000000
|
0.0000
|
1.0000000
|
6.8384268
|
|
Expertisenegative
|
0.0308365
|
0.0874027
|
0.0000000
|
0.0000
|
1.0000000
|
6.7252228
|
|
Time_negative
|
0.0365831
|
0.0713895
|
0.0000000
|
0.0000
|
0.5000000
|
2.8473511
|
|
Bedside_negative
|
0.0183953
|
0.0478848
|
0.0000000
|
0.0000
|
0.3333333
|
3.7851412
|
|
Office_negative
|
0.0151664
|
0.0443944
|
0.0000000
|
0.0000
|
0.3333333
|
4.4347950
|
|
Cost_negative
|
0.0052563
|
0.0278533
|
0.0000000
|
0.0000
|
0.3333333
|
8.2021183
|
|
Average words per review
|
45.6032389
|
19.9854741
|
44.0000000
|
8.0000
|
135.0000000
|
0.7046350
|
|
Overall_score
|
0.6470287
|
0.3555858
|
0.7937000
|
-0.8750
|
0.9857000
|
-1.5541893
|
# Checking for Normality
library(ggpubr)
# Normality check for the Healthgrades Rating: Q-Q plot first ...
ggqqplot(health$Rating)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(health$Rating)
##
## Shapiro-Wilk normality test
##
## data: health$Rating
## W = 0.91311, p-value = 8.461e-11
# Normality check for the Healthgrades Positive_Proportion: Q-Q plot first ...
ggqqplot(health$Positive_Proportion)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(health$Positive_Proportion)
##
## Shapiro-Wilk normality test
##
## data: health$Positive_Proportion
## W = 0.54009, p-value < 2.2e-16
# Normality check for the Healthgrades Negative_Proportion: Q-Q plot first ...
ggqqplot(health$Negative_Proportion)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(health$Negative_Proportion)
##
## Shapiro-Wilk normality test
##
## data: health$Negative_Proportion
## W = 0.54009, p-value < 2.2e-16
# Normality check for the Healthgrades `Average pos_score`: Q-Q plot first ...
ggqqplot(health$`Average pos_score`)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(health$`Average pos_score`)
##
## Shapiro-Wilk normality test
##
## data: health$`Average pos_score`
## W = 0.72734, p-value < 2.2e-16
# Normality check for the Healthgrades `Average neg_score`: Q-Q plot first ...
ggqqplot(health$`Average neg_score`)

# ... then the Shapiro-Wilk test (a small p-value rejects normality).
shapiro.test(health$`Average neg_score`)
##
## Shapiro-Wilk normality test
##
## data: health$`Average neg_score`
## W = 0.57777, p-value < 2.2e-16
## Multiple t-tests for Healthgrades
# One two-sample t-test per column of `health`, with the rows split by
# healthgrades$Gender and equal variances assumed (var.equal = TRUE).
# The results overwrite the list `a`; nothing is printed here.
a<- lapply(health[], function(x) t.test(x ~ healthgrades$Gender, var.equal = TRUE))
## Plots with Gender
# `health` holds only the numeric columns, so attach Gender as a real column of
# the plotting data instead of mapping an external vector inside aes().
health_with_gender <- health %>%
  mutate(Gender = healthgrades$Gender)

# Overlaid density curves of one column of `data`, one curve per Gender.
#   data    : data frame with a `Gender` factor (levels Female / Male)
#   column  : name of the column to plot on the x axis (string)
#   label   : text used in the title and the y-axis label
#   site    : data-source name shown in the title
#   x_label : x-axis label (defaults to `label`)
#   alpha   : fill transparency of the density curves
# The legend is hidden and the colour key is written into the title, so the
# fill colours are pinned explicitly; with the default palette Female would be
# drawn in red and Male in blue-green, contradicting the title.
plot_density_by_gender <- function(data, column, label, site,
                                   x_label = label, alpha = 0.5) {
  ggplot(data, aes(x = .data[[column]], fill = .data[["Gender"]])) +
    geom_density(alpha = alpha, position = "identity") +
    scale_fill_manual(values = c(Female = "blue", Male = "red")) +
    labs(
      title = paste0(
        "Density of ", label, " by Gender on ", site,
        "\n(blue=Female, red=Male)"
      ),
      x = x_label,
      y = paste0("Density of ", label)
    ) +
    theme(legend.position = "none")
}

# Rating by Gender
plot_density_by_gender(health_with_gender, "Rating", "Rating", "HealthGrades",
                       alpha = 0.4)

# Positive_Proportion by Gender
plot_density_by_gender(health_with_gender, "Positive_Proportion",
                       "Positive Proportion", "HealthGrades",
                       x_label = "Positive_Proportion")

# Negative_Proportion by Gender
plot_density_by_gender(health_with_gender, "Negative_Proportion",
                       "Negative_Proportion", "HealthGrades")

# `Average neg_score` by Gender
plot_density_by_gender(health_with_gender, "Average neg_score",
                       "`Average neg_score`", "HealthGrades")

# `Average pos_score` by Gender
plot_density_by_gender(health_with_gender, "Average pos_score",
                       "`Average pos_score`", "HealthGrades")

# Years of Experience by Gender (rows with a missing value are dropped by
# stat_density with a warning)
plot_density_by_gender(health_with_gender, "Years of Experience",
                       "Years of Experience", "HealthGrades",
                       x_label = "Years of Experience - in years")
## Warning: Removed 4 rows containing non-finite values (stat_density).

# Overall_score density on Healthgrades, one curve per Gender.
# Gender is attached as a column so aes() maps a variable of the plotted data
# rather than an external vector.
health %>%
  mutate(Gender = healthgrades$Gender) %>%
  ggplot(aes(x = Overall_score, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  # The legend is hidden and the key is written in the title, so pin the fill
  # colours to match it (the default palette would draw Female in red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  labs(
    title = "Density of Overall_score by Gender on HealthGrades\n(blue=Female, red=Male)",
    x = "Overall_score",
    y = "Density of Overall_score"
  ) +
  theme(legend.position = "none")

library(effsize)
##
## Attaching package: 'effsize'
## The following object is masked from 'package:psych':
##
## cohen.d
library(DT)
# Interactive table of the descriptive statistics of `rate`.
# NOTE(review): this sits in the Healthgrades section but summarises the
# RateMDs data (`rate`) - confirm that is intended.
data_descriptives <- describe(rate)
# Only these six statistics are shown, so round exactly those columns; the
# previous call asked formatRound() for columns 1:13 of a 6-column table.
shown_stats <- c("mean", "sd", "median", "min", "max", "skew")
datatable(data_descriptives[, shown_stats]) %>%
  formatRound(columns = shown_stats, digits = 2)
Combined
# Columns of the Healthgrades data that also exist in the RateMDs data,
# selected by position and put in a common order.
hcommon <- data.frame(
  healthgrades[, c(3, 5, 20:32, 7, 14, 17, 16, 19, 33)]
)
dim(hcommon)
## [1] 247 21
# Columns of the RateMDs data that also exist in the Healthgrades data,
# selected by position and put in the same order as `hcommon`.
# They are taken from `r`, the copy of `ratemdsfinal` made before
# Years of Experience was recoded into bands: `hcommon` carries numeric years,
# and stacking the banded factor under it turns the combined column into a
# character mix of numbers and band labels.
rcommon <- data.frame(
  r[, c(3, 4, 17:28, 32, 6, 11, 14, 12, 16, 33)]
)
dim(rcommon)
## [1] 793 21
# Stack the Healthgrades rows on top of the RateMDs rows.
combined_rows <- rbind(hcommon, rcommon)
dcombine <- data.frame(combined_rows)
str(dcombine)
## 'data.frame': 1040 obs. of 21 variables:
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 1 2 2 2 1 ...
## $ Years.of.Experience : chr "12" "21" "28" "11" ...
## $ Communication_positive : num 0.0676 0.0286 0.1 0.1538 0.3333 ...
## $ Expertisepositive : num 0.189 0.114 0.1 0.154 0.167 ...
## $ Timepositive : num 0.1216 0.0714 0 0.1154 0 ...
## $ Bedside_positive : num 0.108 0.186 0.35 0.115 0.333 ...
## $ Officepositive : num 0.027 0.0429 0.15 0.0769 0 ...
## $ Costpositive : num 0.0541 0.0286 0 0.0385 0 ...
## $ Communication_negative : num 0 0.0143 0 0 0 ...
## $ Expertisenegative : num 0.027 0.0143 0 0 0 ...
## $ Time_negative : num 0 0.0429 0 0 0 ...
## $ Bedside_negative : num 0 0.0429 0 0 0 ...
## $ Office_negative : num 0 0.0143 0 0 0 ...
## $ Cost_negative : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Average.words.per.review: num 29 72 44 40 42 61 29 41 17 34 ...
## $ Rating : num 4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
## $ Positive_Proportion : num 1 1 1 1 1 ...
## $ Negative_Proportion : num 0 0 0 0 0 ...
## $ Average.pos_score : num 0.824 0.792 0.916 0.897 0.936 ...
## $ Average.neg_score : num 0 0 0 0 0 0 0 -0.612 0 0 ...
## $ Overall_score : num 0.824 0.792 0.916 0.897 0.936 ...
# Treat Years.of.Experience as categorical in the combined data.
# Factor columns are summarised by describe() via their level codes and are
# marked with "*" in the row name.
dcombine[["Years.of.Experience"]] <- factor(dcombine[["Years.of.Experience"]])
# Descriptive statistics for the combined data: mean, sd, median, min, max, skew.
describe(dcombine)[c("mean", "sd", "median", "min", "max", "skew")] %>%
  xtable() %>%
  kable()
|
|
mean
|
sd
|
median
|
min
|
max
|
skew
|
|
Gender*
|
1.4730769
|
0.4995148
|
1.0000000
|
1.000
|
2.0000
|
0.1076933
|
|
Years.of.Experience*
|
19.4922780
|
12.6408180
|
18.0000000
|
1.000
|
51.0000
|
1.2210775
|
|
Communication_positive
|
0.0651197
|
0.1146517
|
0.0000000
|
0.000
|
1.0000
|
3.1973305
|
|
Expertisepositive
|
0.1262908
|
0.1620018
|
0.0909091
|
0.000
|
1.0000
|
2.4456190
|
|
Timepositive
|
0.0627967
|
0.1055687
|
0.0000000
|
0.000
|
1.0000
|
2.7014127
|
|
Bedside_positive
|
0.1433768
|
0.1727294
|
0.1111111
|
0.000
|
1.0000
|
2.2205717
|
|
Officepositive
|
0.0544948
|
0.1060410
|
0.0000000
|
0.000
|
1.0000
|
3.5336472
|
|
Costpositive
|
0.0361672
|
0.0736005
|
0.0000000
|
0.000
|
0.5000
|
2.7902871
|
|
Communication_negative
|
0.0399820
|
0.0908382
|
0.0000000
|
0.000
|
1.0000
|
4.4908606
|
|
Expertisenegative
|
0.0546442
|
0.1206358
|
0.0000000
|
0.000
|
1.0000
|
4.3331605
|
|
Time_negative
|
0.0467875
|
0.0988836
|
0.0000000
|
0.000
|
1.0000
|
3.8977334
|
|
Bedside_negative
|
0.0299485
|
0.0944457
|
0.0000000
|
0.000
|
1.0000
|
6.6026214
|
|
Office_negative
|
0.0293538
|
0.0750176
|
0.0000000
|
0.000
|
0.6000
|
3.8922822
|
|
Cost_negative
|
0.0119775
|
0.0541032
|
0.0000000
|
0.000
|
1.0000
|
9.5204150
|
|
Average.words.per.review
|
50.9807692
|
47.9577896
|
41.0000000
|
0.000
|
737.0000
|
6.3078409
|
|
Rating
|
4.1334423
|
1.0470945
|
4.5000000
|
1.000
|
5.0000
|
-1.3967250
|
|
Positive_Proportion
|
0.7744462
|
0.3391379
|
1.0000000
|
0.000
|
1.0000
|
-1.3426505
|
|
Negative_Proportion
|
0.1945310
|
0.3203035
|
0.0000000
|
0.000
|
1.0000
|
1.5719105
|
|
Average.pos_score
|
0.6917569
|
0.2982497
|
0.7988333
|
0.000
|
0.9923
|
-1.4246449
|
|
Average.neg_score
|
-0.2221033
|
0.3278596
|
0.0000000
|
-0.981
|
0.0000
|
-1.0272195
|
|
Overall_score
|
0.4696536
|
0.5107502
|
0.6907250
|
-0.981
|
0.9914
|
-1.1026585
|
## Multiple t-tests for the combined data (Healthgrades + RateMDs)
# One two-sample t-test per column of `dcombine`, with the rows split by
# dcombine$Gender and equal variances assumed (var.equal = TRUE).
# The first two columns (Gender and Years.of.Experience) are excluded.
# The results overwrite the list `a`; nothing is printed here.
a<- lapply(dcombine[,-c(1:2)], function(x) t.test(x ~ dcombine$Gender, var.equal = TRUE))
# Rating density on the combined data, one curve per Gender.
# Gender is a column of `dcombine`, so it is mapped by name; writing
# `dcombine$Gender` inside aes() triggers a ggplot2 warning.
dcombine %>%
  ggplot(aes(x = Rating, fill = Gender)) +
  geom_density(alpha = 0.4, position = "identity") +
  # The legend is hidden and the key is written in the title, so pin the fill
  # colours to match it (the default palette would draw Female in red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  labs(
    title = "Density of Rating by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Rating",
    y = "Density of Rating"
  ) +
  theme(legend.position = "none")
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Positive_Proportion in `dcombine`, one filled curve per Gender
# value.
# NOTE(review): the fill legend is hidden and the title hard-codes
# "blue=Female, red=Male" -- confirm this matches the colours ggplot2 actually
# assigns to the Gender levels.
dcombine %>%
  # Map fill to the bare column name: `dcombine$Gender` inside aes() bypasses
  # the plot's data and triggers ggplot2's "Use of `dcombine$Gender` is
  # discouraged" warning.
  ggplot(aes(x = Positive_Proportion, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  labs(
    title = "Density of Positive Proportion by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Positive_Proportion",
    y = "Density of Positive Proportion"
  ) +
  theme(legend.position = "none") # hide the fill legend
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Negative_Proportion in `dcombine`, one filled curve per Gender
# value.
# NOTE(review): the fill legend is hidden and the title hard-codes
# "blue=Female, red=Male" -- confirm this matches the colours ggplot2 actually
# assigns to the Gender levels.
dcombine %>%
  # Map fill to the bare column name: `dcombine$Gender` inside aes() bypasses
  # the plot's data and triggers ggplot2's "Use of `dcombine$Gender` is
  # discouraged" warning.
  ggplot(aes(x = Negative_Proportion, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  labs(
    title = "Density of Negative_Proportion by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Negative_Proportion",
    y = "Density of Negative_Proportion"
  ) +
  theme(legend.position = "none") # hide the fill legend
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Average.neg_score in `dcombine`, one filled curve per Gender
# value.
# NOTE(review): the fill legend is hidden and the title hard-codes
# "blue=Female, red=Male" -- confirm this matches the colours ggplot2 actually
# assigns to the Gender levels.
dcombine %>%
  # Map fill to the bare column name: `dcombine$Gender` inside aes() bypasses
  # the plot's data and triggers ggplot2's "Use of `dcombine$Gender` is
  # discouraged" warning.
  ggplot(aes(x = Average.neg_score, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  labs(
    title = "Density of `Average neg_score` by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "`Average neg_score`",
    y = "Density of `Average neg_score`"
  ) +
  theme(legend.position = "none") # hide the fill legend
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Average.pos_score in `dcombine`, one filled curve per Gender
# value.
# NOTE(review): the fill legend is hidden and the title hard-codes
# "blue=Female, red=Male" -- confirm this matches the colours ggplot2 actually
# assigns to the Gender levels.
dcombine %>%
  # Map fill to the bare column name: `dcombine$Gender` inside aes() bypasses
  # the plot's data and triggers ggplot2's "Use of `dcombine$Gender` is
  # discouraged" warning.
  ggplot(aes(x = Average.pos_score, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  labs(
    title = "Density of `Average pos_score` by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "`Average pos_score`",
    y = "Density of `Average pos_score`"
  ) +
  theme(legend.position = "none") # hide the fill legend
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Years.of.Experience in `dcombine`, one filled curve per Gender
# value.
# NOTE(review): the fill legend is hidden and the title hard-codes
# "blue=Female, red=Male" -- confirm this matches the colours ggplot2 actually
# assigns to the Gender levels.
# NOTE(review): the recorded run emitted many "Groups with fewer than two data
# points have been dropped" warnings for this plot. That usually means the
# data are being split into many tiny groups, e.g. because
# Years.of.Experience is not numeric in `dcombine` -- TODO confirm its type.
dcombine %>%
  # Map fill to the bare column name: `dcombine$Gender` inside aes() bypasses
  # the plot's data and triggers ggplot2's "Use of `dcombine$Gender` is
  # discouraged" warning.
  ggplot(aes(x = Years.of.Experience, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  labs(
    title = "Density of Years of Experience by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Years of Experience - in years",
    y = "Density of Years of Experience"
  ) +
  theme(legend.position = "none") # hide the fill legend
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.

# Density of Overall_score in `dcombine`, one filled curve per Gender value.
# NOTE(review): the fill legend is hidden and the title hard-codes
# "blue=Female, red=Male" -- confirm this matches the colours ggplot2 actually
# assigns to the Gender levels.
dcombine %>%
  # Map fill to the bare column name: `dcombine$Gender` inside aes() bypasses
  # the plot's data and triggers ggplot2's "Use of `dcombine$Gender` is
  # discouraged" warning.
  ggplot(aes(x = Overall_score, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  labs(
    title = "Density of Overall_score by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Overall_score",
    y = "Density of Overall_score"
  ) +
  theme(legend.position = "none") # hide the fill legend
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.
