Descriptive Statistics

library(readxl)
ratemdsfinal <- read_excel("ratemdsfinal.xlsx")
## New names:
## * `` -> ...1
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(xtable)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(knitr)
library(psych)
colSums(is.na(ratemdsfinal))
##                        ...1                Doctor_names 
##                           0                           0 
##                      Gender         Years of Experience 
##                           0                          59 
##              Doctor_reviews                      Rating 
##                           0                           0 
##            Review_sentences          Count of Sentiment 
##                           0                           0 
##            Sum of Pos Score Count of Positive Sentiment 
##                           0                           0 
##         Positive_Proportion           Average pos_score 
##                           0                           0 
##            Sum of Neg Score         Negative_Proportion 
##                           0                           0 
## Count of Negative Sentiment           Average neg_score 
##                           0                           0 
##      Communication_positive           Expertisepositive 
##                           0                           0 
##                Timepositive            Bedside_positive 
##                           0                           0 
##              Officepositive                Costpositive 
##                           0                           0 
##      Communication_negative           Expertisenegative 
##                           0                           0 
##               Time_negative            Bedside_negative 
##                           0                           0 
##             Office_negative               Cost_negative 
##                           0                           0 
##                  Word Count                Review Count 
##                           0                           0 
##                Phrase Count    Average words per review 
##                           0                           0 
##               Overall_score 
##                           0
ratemdsfinal$`Years of Experience`[is.na(ratemdsfinal$`Years of Experience`)]=round(mean(ratemdsfinal$`Years of Experience`,na.rm = T))

# ghetting only the numeric columsn 
num_cols <- unlist(lapply(ratemdsfinal, is.numeric))  
rate <- ratemdsfinal[, num_cols]

rate$...1<- NULL
str(rate)
## tibble [793 × 28] (S3: tbl_df/tbl/data.frame)
##  $ Years of Experience        : num [1:793] 21 15 26 21 27 38 49 36 25 26 ...
##  $ Rating                     : num [1:793] 4.85 4.47 4.92 4.89 4.89 5 4.44 4.75 4.93 4.78 ...
##  $ Count of Sentiment         : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
##  $ Sum of Pos Score           : num [1:793] 9.81 18.01 7.11 8.41 6.54 ...
##  $ Count of Positive Sentiment: num [1:793] 12 21 8 9 9 7 15 10 7 8 ...
##  $ Positive_Proportion        : num [1:793] 0.857 0.913 0.889 1 1 ...
##  $ Average pos_score          : num [1:793] 0.817 0.858 0.889 0.934 0.726 ...
##  $ Sum of Neg Score           : num [1:793] -0.9394 -0.6902 -0.0018 0 0 ...
##  $ Negative_Proportion        : num [1:793] 0.143 0.087 0.111 0 0 ...
##  $ Count of Negative Sentiment: num [1:793] 2 2 1 0 0 0 2 1 0 0 ...
##  $ Average neg_score          : num [1:793] -0.4697 -0.3451 -0.0018 0 0 ...
##  $ Communication_positive     : num [1:793] 0.0323 0.0876 0 0.0513 0.1 ...
##  $ Expertisepositive          : num [1:793] 0.145 0.073 0.139 0.103 0.267 ...
##  $ Timepositive               : num [1:793] 0.0806 0.0438 0.0556 0.1538 0.1667 ...
##  $ Bedside_positive           : num [1:793] 0 0.19 0.194 0.103 0.167 ...
##  $ Officepositive             : num [1:793] 0.0645 0.0949 0.1111 0.0513 0.0333 ...
##  $ Costpositive               : num [1:793] 0.0161 0.073 0.0833 0 0.0667 ...
##  $ Communication_negative     : num [1:793] 0.0484 0.0438 0.0278 0 0 ...
##  $ Expertisenegative          : num [1:793] 0.0645 0.0219 0.0278 0 0.0333 ...
##  $ Time_negative              : num [1:793] 0.0968 0.0511 0 0 0 ...
##  $ Bedside_negative           : num [1:793] 0.0161 0.0219 0 0 0 ...
##  $ Office_negative            : num [1:793] 0.0323 0.0365 0 0 0 ...
##  $ Cost_negative              : num [1:793] 0 0.0219 0 0 0 ...
##  $ Word Count                 : num [1:793] 523 1604 288 358 280 ...
##  $ Review Count               : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
##  $ Phrase Count               : num [1:793] 67 165 38 42 48 32 133 93 41 67 ...
##  $ Average words per review   : num [1:793] 37 70 32 40 31 36 69 100 38 77 ...
##  $ Overall_score              : num [1:793] 0.348 0.513 0.887 0.934 0.726 ...
names(rate)[1]<- "Exp"

# Print Descriptive Statistics for RateMDs
kable(xtable(describe(rate)[c(3,4,5,8,9,11)]))
mean sd median min max skew
Exp 25.9394704 10.7023017 26.0000000 2.0000 56.0000 0.2925266
Rating 3.9887516 1.1456304 4.5000000 1.0000 5.0000 -1.0690984
Count of Sentiment 2.9621690 3.2622855 2.0000000 1.0000 30.0000 3.0905276
Sum of Pos Score 1.6952861 2.0505289 0.9493000 0.0000 18.0120 2.9120548
Count of Positive Sentiment 2.1601513 2.5047079 1.0000000 0.0000 21.0000 2.8523571
Positive_Proportion 0.7329744 0.3628249 1.0000000 0.0000 1.0000 -1.0791118
Average pos_score 0.6597428 0.3194366 0.7778000 0.0000 0.9923 -1.1775197
Sum of Neg Score -0.4450890 0.8923222 0.0000000 -10.8437 0.0000 -4.3611618
Negative_Proportion 0.2263399 0.3439964 0.0000000 0.0000 1.0000 1.3315686
Count of Negative Sentiment 0.6935687 1.2951625 0.0000000 0.0000 15.0000 4.1368418
Average neg_score -0.2453372 0.3380014 0.0000000 -0.9810 0.0000 -0.8781404
Communication_positive 0.0519260 0.1105353 0.0000000 0.0000 1.0000 4.0196863
Expertisepositive 0.1172052 0.1680621 0.0666667 0.0000 1.0000 2.6532162
Timepositive 0.0530472 0.1028384 0.0000000 0.0000 1.0000 3.2624407
Bedside_positive 0.1393430 0.1792870 0.1000000 0.0000 1.0000 2.2700317
Officepositive 0.0567404 0.1138064 0.0000000 0.0000 1.0000 3.5556046
Costpositive 0.0388190 0.0780689 0.0000000 0.0000 0.5000 2.6948463
Communication_negative 0.0427782 0.0923958 0.0000000 0.0000 1.0000 3.9127936
Expertisenegative 0.0620597 0.1284017 0.0000000 0.0000 1.0000 3.9608482
Time_negative 0.0499659 0.1058386 0.0000000 0.0000 1.0000 3.8477383
Bedside_negative 0.0335470 0.1045708 0.0000000 0.0000 1.0000 6.1851972
Office_negative 0.0337728 0.0817816 0.0000000 0.0000 0.6000 3.6060950
Cost_negative 0.0140710 0.0598381 0.0000000 0.0000 1.0000 8.8894980
Word Count 169.6973518 250.4917975 78.0000000 0.0000 2350.0000 3.5365532
Review Count 2.9621690 3.2622855 2.0000000 1.0000 30.0000 3.0905276
Phrase Count 17.4186633 23.0809676 9.0000000 1.0000 195.0000 3.4406515
Average words per review 52.6557377 53.6781351 40.0000000 0.0000 737.0000 5.8080336
Overall_score 0.4144056 0.5385908 0.6249000 -0.9810 0.9914 -0.9302287
# Checking for Normality 
library(ggpubr)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: magrittr
ggqqplot(rate$Rating)

shapiro.test(rate$Rating)
## 
##  Shapiro-Wilk normality test
## 
## data:  rate$Rating
## W = 0.83136, p-value < 2.2e-16
ggqqplot(rate$Positive_Proportion)

shapiro.test(rate$Positive_Proportion)
## 
##  Shapiro-Wilk normality test
## 
## data:  rate$Positive_Proportion
## W = 0.72269, p-value < 2.2e-16
ggqqplot(rate$Negative_Proportion)

shapiro.test(rate$Negative_Proportion)
## 
##  Shapiro-Wilk normality test
## 
## data:  rate$Negative_Proportion
## W = 0.67902, p-value < 2.2e-16
ggqqplot(rate$`Average pos_score`)

shapiro.test(rate$`Average pos_score`)
## 
##  Shapiro-Wilk normality test
## 
## data:  rate$`Average pos_score`
## W = 0.78975, p-value < 2.2e-16
ggqqplot(rate$`Average neg_score`)

shapiro.test(rate$`Average neg_score`)
## 
##  Shapiro-Wilk normality test
## 
## data:  rate$`Average neg_score`
## W = 0.7132, p-value < 2.2e-16
## Multiple t-tests for RateMDs
a<- lapply(rate[], function(x) t.test(x ~ ratemdsfinal$Gender, var.equal = TRUE))

Statistics based on Gender

#str(ratemdsfinal)
ratemdsfinal$Gender<- factor(ratemdsfinal$Gender)

rate %>% #plot rating over Gender 
  ggplot(aes(x=Rating,fill=ratemdsfinal$Gender))+
  geom_density(alpha=.4,position="identity")+
  labs(title = "Density of Rating by Gender on RateMDs\n(blue=Female, red=Male)",x="Rating",y = "Density of Rating")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

rate %>% #plot Positive_Proportion over Gender 
  ggplot(aes(x=Positive_Proportion,fill=ratemdsfinal$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Positive Proportion by Gender on RateMDs\n(blue=Female, red=Male)",x="Positive_Proportion",y = "Density of Positive Proportion")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

rate %>% #plot Negative_Proportion over Gender 
  ggplot(aes(x=Negative_Proportion,fill=ratemdsfinal$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Negative_Proportion by Gender on RateMDs\n(blue=Female, red=Male)",x="Negative_Proportion",y = "Density of Negative_Proportion")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

rate %>% #plot `Average neg_score` over Gender 
  ggplot(aes(x=`Average neg_score`,fill=ratemdsfinal$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of `Average neg_score` by Gender on RateMDs\n(blue=Female, red=Male)",x="`Average neg_score`",y = "Density of `Average neg_score`")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

rate %>% #plot `Average pos_score` over Gender 
  ggplot(aes(x=`Average pos_score`,fill=ratemdsfinal$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of `Average pos_score` by Gender on RateMDs\n(blue=Female, red=Male)",x="`Average pos_score`",y = "Density of `Average pos_score`")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

rate %>% #plot Years of Experience over Gender 
  ggplot(aes(x=Exp,fill=ratemdsfinal$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Years of Experience by Gender on RateMDs\n(blue=Female, red=Male)",x="Years of Experience - in years",y = "Density of Years of Experience")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

rate %>% #plot Overall_score over Gender 
  ggplot(aes(x=Overall_score,fill=ratemdsfinal$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Overall_score by Gender on RateMDs\n(blue=Female, red=Male)",x="Overall_score",y = "Density of Overall_score")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

## Statistics by No of Years of Experience

colSums(is.na(ratemdsfinal))
##                        ...1                Doctor_names 
##                           0                           0 
##                      Gender         Years of Experience 
##                           0                           0 
##              Doctor_reviews                      Rating 
##                           0                           0 
##            Review_sentences          Count of Sentiment 
##                           0                           0 
##            Sum of Pos Score Count of Positive Sentiment 
##                           0                           0 
##         Positive_Proportion           Average pos_score 
##                           0                           0 
##            Sum of Neg Score         Negative_Proportion 
##                           0                           0 
## Count of Negative Sentiment           Average neg_score 
##                           0                           0 
##      Communication_positive           Expertisepositive 
##                           0                           0 
##                Timepositive            Bedside_positive 
##                           0                           0 
##              Officepositive                Costpositive 
##                           0                           0 
##      Communication_negative           Expertisenegative 
##                           0                           0 
##               Time_negative            Bedside_negative 
##                           0                           0 
##             Office_negative               Cost_negative 
##                           0                           0 
##                  Word Count                Review Count 
##                           0                           0 
##                Phrase Count    Average words per review 
##                           0                           0 
##               Overall_score 
##                           0
r<- ratemdsfinal

summary(ratemdsfinal$`Years of Experience`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   19.00   26.00   25.94   33.00   56.00
ratemdsfinal$`Years of Experience` <- as.factor(ratemdsfinal$`Years of Experience`)
levels(ratemdsfinal$`Years of Experience`) = list("Less than 6 years" = c(0:6),
                                        "7 to 14 years " = c(7:14), 
                                        "15 to 23 years" = c(15:23), 
                                        "24 and above" = c(24:56))
summary(ratemdsfinal$`Years of Experience`)
## Less than 6 years    7 to 14 years     15 to 23 years      24 and above 
##                19                86               240               448
# shows how the rating is divided between No of years of experience of the doctor
ggplot(data = rate, mapping = aes(x =Rating)) +
  geom_density(alpha=.4,position="identity")+
  facet_wrap(~ ratemdsfinal$`Years of Experience`)+
  ggtitle("Vistualising Years of Experience over Rating")+
  xlab("CRating")+
  theme_bw()

rate %>% 
  ggplot(aes(x=Rating,fill=ratemdsfinal$`Years of Experience`))+
  geom_density(alpha=.7,position="identity")+
  labs(title = "Density of Rating over Years of Experience on RateMDs",x="Rating Received on RateMDs",y = "Density of Rating")

str(ratemdsfinal)
## tibble [793 × 33] (S3: tbl_df/tbl/data.frame)
##  $ ...1                       : num [1:793] 0 1 2 3 4 5 6 7 8 9 ...
##  $ Doctor_names               : chr [1:793] "Dr. Martine T. Nelson" "Dr. Jason C. Tjaden" "Dr. Paul Tortoriello" "Dr. Maria Rosa" ...
##  $ Gender                     : Factor w/ 2 levels "Female","Male": 1 2 2 1 1 1 2 2 2 1 ...
##  $ Years of Experience        : Factor w/ 4 levels "Less than 6 years",..: 3 3 4 3 4 4 4 4 4 4 ...
##  $ Doctor_reviews             : chr [1:793] "['Dr Nelson and her staff are wonderful! They are always on time and very caring!', 'Dr. Nelson was our family "| __truncated__ "['We love Dr. Tjaden! Our daughter had medical issues when she was born and he was incredibly supportive, affir"| __truncated__ "['Excellent doctor! Absolute best pediatrician. We switched from another local doctor who misdiagnosed my daugh"| __truncated__ "[\"Dr. Rosa is one of a kind. Genuine, caring and full of kindness. She's gentle with my girls and they LOVE he"| __truncated__ ...
##  $ Rating                     : num [1:793] 4.85 4.47 4.92 4.89 4.89 5 4.44 4.75 4.93 4.78 ...
##  $ Review_sentences           : chr [1:793] "[\"['Dr Nelson and her staff are wonderful\", ' They are always on time and very caring', \", 'Dr\", ' Nelson w"| __truncated__ "[\"['We love Dr\", ' Tjaden', ' Our daughter had medical issues when she was born and he was incredibly support"| __truncated__ "[\"['Excellent doctor\", ' Absolute best pediatrician', ' We switched from another local doctor who misdiagnose"| __truncated__ "['[\"Dr', ' Rosa is one of a kind', ' Genuine, caring and full of kindness', \" She's gentle with my girls and "| __truncated__ ...
##  $ Count of Sentiment         : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
##  $ Sum of Pos Score           : num [1:793] 9.81 18.01 7.11 8.41 6.54 ...
##  $ Count of Positive Sentiment: num [1:793] 12 21 8 9 9 7 15 10 7 8 ...
##  $ Positive_Proportion        : num [1:793] 0.857 0.913 0.889 1 1 ...
##  $ Average pos_score          : num [1:793] 0.817 0.858 0.889 0.934 0.726 ...
##  $ Sum of Neg Score           : num [1:793] -0.9394 -0.6902 -0.0018 0 0 ...
##  $ Negative_Proportion        : num [1:793] 0.143 0.087 0.111 0 0 ...
##  $ Count of Negative Sentiment: num [1:793] 2 2 1 0 0 0 2 1 0 0 ...
##  $ Average neg_score          : num [1:793] -0.4697 -0.3451 -0.0018 0 0 ...
##  $ Communication_positive     : num [1:793] 0.0323 0.0876 0 0.0513 0.1 ...
##  $ Expertisepositive          : num [1:793] 0.145 0.073 0.139 0.103 0.267 ...
##  $ Timepositive               : num [1:793] 0.0806 0.0438 0.0556 0.1538 0.1667 ...
##  $ Bedside_positive           : num [1:793] 0 0.19 0.194 0.103 0.167 ...
##  $ Officepositive             : num [1:793] 0.0645 0.0949 0.1111 0.0513 0.0333 ...
##  $ Costpositive               : num [1:793] 0.0161 0.073 0.0833 0 0.0667 ...
##  $ Communication_negative     : num [1:793] 0.0484 0.0438 0.0278 0 0 ...
##  $ Expertisenegative          : num [1:793] 0.0645 0.0219 0.0278 0 0.0333 ...
##  $ Time_negative              : num [1:793] 0.0968 0.0511 0 0 0 ...
##  $ Bedside_negative           : num [1:793] 0.0161 0.0219 0 0 0 ...
##  $ Office_negative            : num [1:793] 0.0323 0.0365 0 0 0 ...
##  $ Cost_negative              : num [1:793] 0 0.0219 0 0 0 ...
##  $ Word Count                 : num [1:793] 523 1604 288 358 280 ...
##  $ Review Count               : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
##  $ Phrase Count               : num [1:793] 67 165 38 42 48 32 133 93 41 67 ...
##  $ Average words per review   : num [1:793] 37 70 32 40 31 36 69 100 38 77 ...
##  $ Overall_score              : num [1:793] 0.348 0.513 0.887 0.934 0.726 ...

Healthgrades

library(readxl)
healthgrades <- read_excel("healthgradesfinal.xlsx")

str(healthgrades)
## tibble [247 × 33] (S3: tbl_df/tbl/data.frame)
##  $ DoctorName              : chr [1:247] "Dr. Amy Williams, MD" "Dr. Roma Franzia, MD" "Dr. Elizabeth Manjooran, MD" "Dr. Jason Canel, MD" ...
##  $ Age                     : chr [1:247] "• Age 42" "• Age 50" "• Age 59" "• Age 48" ...
##  $ Gender                  : chr [1:247] "Female" "Female" "Female" "Male" ...
##  $ Speciality              : chr [1:247] "Pediatrics" "Pediatrics" "Pediatrics" "Pediatrics" ...
##  $ Years of Experience     : num [1:247] 12 21 28 11 43 24 42 39 16 41 ...
##  $ Biography               : chr [1:247] "Dr. Amy Williams, MD is a pediatrics specialist in Chicago, IL. She specializes in pediatrics." "Dr. Roma Franzia, MD is a pediatrics specialist in Winnetka, IL and has been practicing for 21 years. She gradu"| __truncated__ "Dr. Elizabeth Manjooran, MD is a pediatrics specialist in Des Plaines, IL. She specializes in pediatrics." "Dr. Jason Canel, MD is a pediatrics specialist in Glenview, IL and has been practicing for 11 years. He graduat"| __truncated__ ...
##  $ Rating                  : num [1:247] 4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
##  $ Reviews                 : chr [1:247] "[\"I've been seeing Dr. Williams for years and through 2 pregnancies. She is the absolute best. Caring, compass"| __truncated__ "[\"Dr. Roma Franzia is the best pediatrician ever! So caring about my kids and family as a whole, always availa"| __truncated__ "['She’s not only a very good doctor but also a wonderful person! Also her office staff is very professional, wa"| __truncated__ "[\"We were assigned Dr Canel at my daughter's birth. I am so glad he was on call that day! My daughter is now t"| __truncated__ ...
##  $ Review_cleaned          : chr [1:247] "['[\"I\\'ve been seeing Dr', ' Williams for years ', ' through 2 pregnancies', ' She is the absolute best', ' C"| __truncated__ "['[\"Dr', ' Roma Franzia is the best pediatrician ever', ' So caring about my kids ', ' family as a whole', ' a"| __truncated__ "[\"['She’s not only a very good doctor \", ' also a wonderful person', ' Also her office staff is very professi"| __truncated__ "['[\"We were assigned Dr Canel at my daughter\\'s birth', ' I am so glad he was on call that day', ' My daughte"| __truncated__ ...
##  $ Word Count              : num [1:247] 614 793 176 199 84 182 144 690 50 68 ...
##  $ Review Count            : num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
##  $ Phrase Count            : num [1:247] 166 143 35 47 16 42 41 152 11 11 ...
##  $ Total Count of Sentiment: num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
##  $ Positive_Proportion     : num [1:247] 1 1 1 1 1 ...
##  $ Count of pos Sentiment  : num [1:247] 21 11 4 5 2 3 5 16 3 2 ...
##  $ Average pos_score       : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
##  $ Negative_Proportion     : num [1:247] 0 0 0 0 0 ...
##  $ Count of neg Sentiment  : num [1:247] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Average neg_score       : num [1:247] 0 0 0 0 0 0 0 -0.612 0 0 ...
##  $ Communication_positive  : num [1:247] 0.0676 0.0286 0.1 0.1538 0.3333 ...
##  $ Expertisepositive       : num [1:247] 0.189 0.114 0.1 0.154 0.167 ...
##  $ Timepositive            : num [1:247] 0.1216 0.0714 0 0.1154 0 ...
##  $ Bedside_positive        : num [1:247] 0.108 0.186 0.35 0.115 0.333 ...
##  $ Officepositive          : num [1:247] 0.027 0.0429 0.15 0.0769 0 ...
##  $ Costpositive            : num [1:247] 0.0541 0.0286 0 0.0385 0 ...
##  $ Communication_negative  : num [1:247] 0 0.0143 0 0 0 ...
##  $ Expertisenegative       : num [1:247] 0.027 0.0143 0 0 0 ...
##  $ Time_negative           : num [1:247] 0 0.0429 0 0 0 ...
##  $ Bedside_negative        : num [1:247] 0 0.0429 0 0 0 ...
##  $ Office_negative         : num [1:247] 0 0.0143 0 0 0 ...
##  $ Cost_negative           : num [1:247] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Average words per review: num [1:247] 29 72 44 40 42 61 29 41 17 34 ...
##  $ Overall_score           : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
healthgrades$Gender<- factor(healthgrades$Gender)

healthgrades$Speciality<- factor(healthgrades$Speciality)

table(healthgrades$Speciality)
## 
## Allergy & Immunology          Dermatology      Family Medicine 
##                    3                    1                    2 
##    Internal Medicine    Neonatal Medicine   Pediatric Medicine 
##                   18                    1                    2 
##           Pediatrics 
##                  220
# ghetting only the numeric columsn 
num_cols <- unlist(lapply(healthgrades, is.numeric))  
health <- healthgrades[, num_cols]

str(health)
## tibble [247 × 26] (S3: tbl_df/tbl/data.frame)
##  $ Years of Experience     : num [1:247] 12 21 28 11 43 24 42 39 16 41 ...
##  $ Rating                  : num [1:247] 4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
##  $ Word Count              : num [1:247] 614 793 176 199 84 182 144 690 50 68 ...
##  $ Review Count            : num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
##  $ Phrase Count            : num [1:247] 166 143 35 47 16 42 41 152 11 11 ...
##  $ Total Count of Sentiment: num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
##  $ Positive_Proportion     : num [1:247] 1 1 1 1 1 ...
##  $ Count of pos Sentiment  : num [1:247] 21 11 4 5 2 3 5 16 3 2 ...
##  $ Average pos_score       : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
##  $ Negative_Proportion     : num [1:247] 0 0 0 0 0 ...
##  $ Count of neg Sentiment  : num [1:247] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Average neg_score       : num [1:247] 0 0 0 0 0 0 0 -0.612 0 0 ...
##  $ Communication_positive  : num [1:247] 0.0676 0.0286 0.1 0.1538 0.3333 ...
##  $ Expertisepositive       : num [1:247] 0.189 0.114 0.1 0.154 0.167 ...
##  $ Timepositive            : num [1:247] 0.1216 0.0714 0 0.1154 0 ...
##  $ Bedside_positive        : num [1:247] 0.108 0.186 0.35 0.115 0.333 ...
##  $ Officepositive          : num [1:247] 0.027 0.0429 0.15 0.0769 0 ...
##  $ Costpositive            : num [1:247] 0.0541 0.0286 0 0.0385 0 ...
##  $ Communication_negative  : num [1:247] 0 0.0143 0 0 0 ...
##  $ Expertisenegative       : num [1:247] 0.027 0.0143 0 0 0 ...
##  $ Time_negative           : num [1:247] 0 0.0429 0 0 0 ...
##  $ Bedside_negative        : num [1:247] 0 0.0429 0 0 0 ...
##  $ Office_negative         : num [1:247] 0 0.0143 0 0 0 ...
##  $ Cost_negative           : num [1:247] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Average words per review: num [1:247] 29 72 44 40 42 61 29 41 17 34 ...
##  $ Overall_score           : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
# Print Descriptive Statistics for RateMDs
kable(xtable(describe(health)[c(3,4,5,8,9,11)]))
mean sd median min max skew
Years of Experience 25.8600823 10.9100838 26.0000000 2.0000 53.0000000 0.1548193
Rating 4.5979757 0.3479848 4.6000000 3.5000 5.0000000 -0.5336239
Word Count 194.0971660 270.5785255 124.0000000 11.0000 2018.0000000 4.1629431
Review Count 4.4089069 6.7522054 3.0000000 1.0000 54.0000000 4.9852808
Phrase Count 39.4777328 56.4946083 23.0000000 3.0000 432.0000000 4.4631481
Total Count of Sentiment 4.7692308 9.5512835 3.0000000 1.0000 90.0000000 6.8203153
Positive_Proportion 0.9075924 0.1964834 1.0000000 0.0000 1.0000000 -2.6376345
Count of pos Sentiment 4.2995951 9.0171088 2.0000000 0.0000 86.0000000 7.0230331
Average pos_score 0.7945390 0.1824017 0.8376093 0.0000 0.9857000 -2.5855164
Negative_Proportion 0.0924076 0.1964834 0.0000000 0.0000 1.0000000 2.6376345
Count of neg Sentiment 0.4696356 1.0997455 0.0000000 0.0000 9.0000000 4.0034054
Average neg_score -0.1475103 0.2808211 0.0000000 -0.9471 0.0000000 -1.6245998
Communication_positive 0.1074784 0.1175589 0.0769231 0.0000 0.7500000 1.5961346
Expertisepositive 0.1554605 0.1370798 0.1428571 0.0000 1.0000000 1.6127717
Timepositive 0.0940978 0.1083136 0.0666667 0.0000 0.5000000 1.4619393
Bedside_positive 0.1563275 0.1493452 0.1428571 0.0000 1.0000000 1.9710141
Officepositive 0.0472850 0.0756680 0.0000000 0.0000 0.3333333 1.7511825
Costpositive 0.0276534 0.0562286 0.0000000 0.0000 0.3333333 2.7729653
Communication_negative 0.0310048 0.0852062 0.0000000 0.0000 1.0000000 6.8384268
Expertisenegative 0.0308365 0.0874027 0.0000000 0.0000 1.0000000 6.7252228
Time_negative 0.0365831 0.0713895 0.0000000 0.0000 0.5000000 2.8473511
Bedside_negative 0.0183953 0.0478848 0.0000000 0.0000 0.3333333 3.7851412
Office_negative 0.0151664 0.0443944 0.0000000 0.0000 0.3333333 4.4347950
Cost_negative 0.0052563 0.0278533 0.0000000 0.0000 0.3333333 8.2021183
Average words per review 45.6032389 19.9854741 44.0000000 8.0000 135.0000000 0.7046350
Overall_score 0.6470287 0.3555858 0.7937000 -0.8750 0.9857000 -1.5541893
# Checking for Normality 
library(ggpubr)
ggqqplot(health$Rating)

shapiro.test(health$Rating)
## 
##  Shapiro-Wilk normality test
## 
## data:  health$Rating
## W = 0.91311, p-value = 8.461e-11
ggqqplot(health$Positive_Proportion)

shapiro.test(health$Positive_Proportion)
## 
##  Shapiro-Wilk normality test
## 
## data:  health$Positive_Proportion
## W = 0.54009, p-value < 2.2e-16
ggqqplot(health$Negative_Proportion)

shapiro.test(health$Negative_Proportion)
## 
##  Shapiro-Wilk normality test
## 
## data:  health$Negative_Proportion
## W = 0.54009, p-value < 2.2e-16
ggqqplot(health$`Average pos_score`)

shapiro.test(health$`Average pos_score`)
## 
##  Shapiro-Wilk normality test
## 
## data:  health$`Average pos_score`
## W = 0.72734, p-value < 2.2e-16
ggqqplot(health$`Average neg_score`)

shapiro.test(health$`Average neg_score`)
## 
##  Shapiro-Wilk normality test
## 
## data:  health$`Average neg_score`
## W = 0.57777, p-value < 2.2e-16
## Multiple t-tests for RateMDs
a<- lapply(health[], function(x) t.test(x ~ healthgrades$Gender, var.equal = TRUE))

## Plots with Gender 
health %>% #plot rating over Gender 
  ggplot(aes(x=Rating,fill=healthgrades$Gender))+
  geom_density(alpha=.4,position="identity")+
  labs(title = "Density of Rating by Gender on HealthGrades\n(blue=Female, red=Male)",x="Rating",y = "Density of Rating")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

health %>% #plot Positive_Proportion over Gender 
  ggplot(aes(x=Positive_Proportion,fill=healthgrades$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Positive Proportion by Gender on HealthGrades\n(blue=Female, red=Male)",x="Positive_Proportion",y = "Density of Positive Proportion")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

health  %>% #plot Negative_Proportion over Gender 
  ggplot(aes(x=Negative_Proportion,fill=healthgrades$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Negative_Proportion by Gender on HealthGrades\n(blue=Female, red=Male)",x="Negative_Proportion",y = "Density of Negative_Proportion")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

health  %>% #plot `Average neg_score` over Gender 
  ggplot(aes(x=`Average neg_score`,fill=healthgrades$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of `Average neg_score` by Gender on HealthGrades\n(blue=Female, red=Male)",x="`Average neg_score`",y = "Density of `Average neg_score`")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

health %>% #plot `Average pos_score` over Gender 
  ggplot(aes(x=`Average pos_score`,fill=healthgrades$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of `Average pos_score` by Gender on HealthGrades\n(blue=Female, red=Male)",x="`Average pos_score`",y = "Density of `Average pos_score`")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

health  %>% #plot Years of Experience over Gender 
  ggplot(aes(x=`Years of Experience`,fill=healthgrades$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Years of Experience by Gender on HealthGrades\n(blue=Female, red=Male)",x="Years of Experience - in years",y = "Density of Years of Experience")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter
## Warning: Removed 4 rows containing non-finite values (stat_density).

health %>% #plot Overall_score over Gender 
  ggplot(aes(x=Overall_score,fill=healthgrades$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Overall_score by Gender on HealthGrades\n(blue=Female, red=Male)",x="Overall_score",y = "Density of Overall_score")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter

library(effsize)
## 
## Attaching package: 'effsize'
## The following object is masked from 'package:psych':
## 
##     cohen.d
library(DT)

data_descriptives <- describe(rate)
datatable(data_descriptives[,c(3,4,5,8,9,11)]) %>%
  formatRound(1:13, 2)

Combined

hcommon=data.frame(healthgrades[,c(3,5,20,21,22,23,24,25,26,27,28,29,30,31,32,7,14,17,16,19,33)])
dim(hcommon)
## [1] 247  21
rcommon=data.frame(ratemdsfinal[,c(3,4,17,18,19,20,21,22,23,24,25,26,27,28,32,6,11,14,12,16,33)])
dim(rcommon)
## [1] 793  21
dcombine=data.frame(rbind(hcommon,rcommon))
str(dcombine)
## 'data.frame':    1040 obs. of  21 variables:
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 1 2 2 2 1 ...
##  $ Years.of.Experience     : chr  "12" "21" "28" "11" ...
##  $ Communication_positive  : num  0.0676 0.0286 0.1 0.1538 0.3333 ...
##  $ Expertisepositive       : num  0.189 0.114 0.1 0.154 0.167 ...
##  $ Timepositive            : num  0.1216 0.0714 0 0.1154 0 ...
##  $ Bedside_positive        : num  0.108 0.186 0.35 0.115 0.333 ...
##  $ Officepositive          : num  0.027 0.0429 0.15 0.0769 0 ...
##  $ Costpositive            : num  0.0541 0.0286 0 0.0385 0 ...
##  $ Communication_negative  : num  0 0.0143 0 0 0 ...
##  $ Expertisenegative       : num  0.027 0.0143 0 0 0 ...
##  $ Time_negative           : num  0 0.0429 0 0 0 ...
##  $ Bedside_negative        : num  0 0.0429 0 0 0 ...
##  $ Office_negative         : num  0 0.0143 0 0 0 ...
##  $ Cost_negative           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Average.words.per.review: num  29 72 44 40 42 61 29 41 17 34 ...
##  $ Rating                  : num  4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
##  $ Positive_Proportion     : num  1 1 1 1 1 ...
##  $ Negative_Proportion     : num  0 0 0 0 0 ...
##  $ Average.pos_score       : num  0.824 0.792 0.916 0.897 0.936 ...
##  $ Average.neg_score       : num  0 0 0 0 0 0 0 -0.612 0 0 ...
##  $ Overall_score           : num  0.824 0.792 0.916 0.897 0.936 ...
dcombine$Years.of.Experience <- factor(dcombine$Years.of.Experience)

# Print Descriptive Statistics for Combined
kable(xtable(describe(dcombine)[c(3,4,5,8,9,11)]))
mean sd median min max skew
Gender* 1.4730769 0.4995148 1.0000000 1.000 2.0000 0.1076933
Years.of.Experience* 19.4922780 12.6408180 18.0000000 1.000 51.0000 1.2210775
Communication_positive 0.0651197 0.1146517 0.0000000 0.000 1.0000 3.1973305
Expertisepositive 0.1262908 0.1620018 0.0909091 0.000 1.0000 2.4456190
Timepositive 0.0627967 0.1055687 0.0000000 0.000 1.0000 2.7014127
Bedside_positive 0.1433768 0.1727294 0.1111111 0.000 1.0000 2.2205717
Officepositive 0.0544948 0.1060410 0.0000000 0.000 1.0000 3.5336472
Costpositive 0.0361672 0.0736005 0.0000000 0.000 0.5000 2.7902871
Communication_negative 0.0399820 0.0908382 0.0000000 0.000 1.0000 4.4908606
Expertisenegative 0.0546442 0.1206358 0.0000000 0.000 1.0000 4.3331605
Time_negative 0.0467875 0.0988836 0.0000000 0.000 1.0000 3.8977334
Bedside_negative 0.0299485 0.0944457 0.0000000 0.000 1.0000 6.6026214
Office_negative 0.0293538 0.0750176 0.0000000 0.000 0.6000 3.8922822
Cost_negative 0.0119775 0.0541032 0.0000000 0.000 1.0000 9.5204150
Average.words.per.review 50.9807692 47.9577896 41.0000000 0.000 737.0000 6.3078409
Rating 4.1334423 1.0470945 4.5000000 1.000 5.0000 -1.3967250
Positive_Proportion 0.7744462 0.3391379 1.0000000 0.000 1.0000 -1.3426505
Negative_Proportion 0.1945310 0.3203035 0.0000000 0.000 1.0000 1.5719105
Average.pos_score 0.6917569 0.2982497 0.7988333 0.000 0.9923 -1.4246449
Average.neg_score -0.2221033 0.3278596 0.0000000 -0.981 0.0000 -1.0272195
Overall_score 0.4696536 0.5107502 0.6907250 -0.981 0.9914 -1.1026585
## Multiple t-tests for RateMDs
a<- lapply(dcombine[,-c(1:2)], function(x) t.test(x ~ dcombine$Gender, var.equal = TRUE))

dcombine %>% #plot rating over Gender 
  ggplot(aes(x=Rating,fill=dcombine$Gender))+
  geom_density(alpha=.4,position="identity")+
  labs(title = "Density of Rating by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",x="Rating",y = "Density of Rating")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

dcombine %>% #plot Positive_Proportion over Gender 
  ggplot(aes(x=Positive_Proportion,fill=dcombine$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Positive Proportion by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",x="Positive_Proportion",y = "Density of Positive Proportion")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

dcombine %>% #plot Negative_Proportion over Gender 
  ggplot(aes(x=Negative_Proportion,fill=dcombine$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Negative_Proportion by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",x="Negative_Proportion",y = "Density of Negative_Proportion")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

dcombine %>% #plot `Average neg_score` over Gender 
  ggplot(aes(x=Average.neg_score,fill=dcombine$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of `Average neg_score` by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",x="`Average neg_score`",y = "Density of `Average neg_score`")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

dcombine %>% #plot `Average pos_score` over Gender 
  ggplot(aes(x=Average.pos_score,fill=dcombine$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of `Average pos_score` by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",x="`Average pos_score`",y = "Density of `Average pos_score`")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

dcombine %>% #plot Years of Experience over Gender 
  ggplot(aes(x=Years.of.Experience,fill=dcombine$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Years of Experience by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",x="Years of Experience - in years",y = "Density of Years of Experience")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.
## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

dcombine %>% #plot Overall_score over Gender 
  ggplot(aes(x=Overall_score,fill=dcombine$Gender))+
  geom_density(alpha=.5,position="identity")+
  labs(title = "Density of Overall_score by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",x="Overall_score",y = "Density of Overall_score")+#add title and axis labels
  theme(legend.position="none")#hide the side signal of fill parameter
## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.