##Assignment
healthcare <- read.csv("C:/Users/murth/healthcare_dataset.csv")
#1. Print the structure
str(healthcare)
## 'data.frame': 10000 obs. of 15 variables:
## $ Name : chr "Tiffany Ramirez" "Ruben Burns" "Chad Byrd" "Antonio Frederick" ...
## $ Age : int 81 35 61 49 51 41 82 55 33 39 ...
## $ Gender : chr "Female" "Male" "Male" "Male" ...
## $ Blood.Type : chr "O-" "O+" "B-" "B-" ...
## $ Medical.Condition : chr "Diabetes" "Asthma" "Obesity" "Asthma" ...
## $ Date.of.Admission : chr "2022-11-17" "2023-06-01" "2019-01-09" "2020-05-02" ...
## $ Doctor : chr "Patrick Parker" "Diane Jackson" "Paul Baker" "Brian Chandler" ...
## $ Hospital : chr "Wallace-Hamilton" "Burke, Griffin and Cooper" "Walton LLC" "Garcia Ltd" ...
## $ Insurance.Provider: chr "Medicare" "UnitedHealthcare" "Medicare" "Medicare" ...
## $ Billing.Amount : num 37491 47304 36875 23303 18086 ...
## $ Room.Number : int 146 404 292 480 477 180 161 384 215 310 ...
## $ Admission.Type : chr "Elective" "Emergency" "Emergency" "Urgent" ...
## $ Discharge.Date : chr "2022-12-01" "2023-06-15" "2019-02-08" "2020-05-03" ...
## $ Medication : chr "Aspirin" "Lipitor" "Lipitor" "Penicillin" ...
## $ Test.Results : chr "Inconclusive" "Normal" "Normal" "Abnormal" ...
#2. List the variables in your dataset
names(healthcare)
## [1] "Name" "Age" "Gender"
## [4] "Blood.Type" "Medical.Condition" "Date.of.Admission"
## [7] "Doctor" "Hospital" "Insurance.Provider"
## [10] "Billing.Amount" "Room.Number" "Admission.Type"
## [13] "Discharge.Date" "Medication" "Test.Results"
#3. Print the top 15 rows of your dataset
head(healthcare, 15)
## Name Age Gender Blood.Type Medical.Condition
## 1 Tiffany Ramirez 81 Female O- Diabetes
## 2 Ruben Burns 35 Male O+ Asthma
## 3 Chad Byrd 61 Male B- Obesity
## 4 Antonio Frederick 49 Male B- Asthma
## 5 Mrs. Brandy Flowers 51 Male O- Arthritis
## 6 Patrick Parker 41 Male AB+ Arthritis
## 7 Charles Horton 82 Male AB+ Hypertension
## 8 Patty Norman 55 Female O- Arthritis
## 9 Ryan Hayes 33 Male A+ Diabetes
## 10 Sharon Perez 39 Female O- Asthma
## 11 Amy Roberts 45 Male B- Cancer
## 12 Mrs. Caroline Farrell 23 Female O- Hypertension
## 13 Christina Williams 85 Female A+ Diabetes
## 14 William Page 72 Female A+ Diabetes
## 15 Michael Bradshaw 65 Female AB+ Cancer
## Date.of.Admission Doctor Hospital
## 1 2022-11-17 Patrick Parker Wallace-Hamilton
## 2 2023-06-01 Diane Jackson Burke, Griffin and Cooper
## 3 2019-01-09 Paul Baker Walton LLC
## 4 2020-05-02 Brian Chandler Garcia Ltd
## 5 2021-07-09 Dustin Griffin Jones, Brown and Murray
## 6 2020-08-20 Robin Green Boyd PLC
## 7 2021-03-22 Patricia Bishop Wheeler, Bryant and Johns
## 8 2019-05-16 Brian Kennedy Brown Inc
## 9 2020-12-17 Kristin Dunn Smith, Edwards and Obrien
## 10 2022-12-15 Jessica Bailey Brown-Golden
## 11 2021-04-13 Anthony Roberts Little-Spencer
## 12 2019-06-09 William Miller Rose Inc
## 13 2021-11-29 Laura Roberts Malone, Thompson and Mejia
## 14 2021-07-29 James Carney Richardson-Powell
## 15 2021-06-05 Katherine Lowe Castaneda-Hardy
## Insurance.Provider Billing.Amount Room.Number Admission.Type Discharge.Date
## 1 Medicare 37490.983 146 Elective 2022-12-01
## 2 UnitedHealthcare 47304.065 404 Emergency 2023-06-15
## 3 Medicare 36874.897 292 Emergency 2019-02-08
## 4 Medicare 23303.322 480 Urgent 2020-05-03
## 5 UnitedHealthcare 18086.344 477 Urgent 2021-08-02
## 6 Aetna 22522.363 180 Urgent 2020-08-23
## 7 Cigna 39593.436 161 Urgent 2021-04-15
## 8 Blue Cross 13546.817 384 Elective 2019-06-02
## 9 Aetna 24903.037 215 Elective 2020-12-22
## 10 Blue Cross 22788.236 310 Urgent 2022-12-16
## 11 Aetna 40325.071 306 Emergency 2021-05-11
## 12 Medicare 6185.904 126 Emergency 2019-06-26
## 13 Aetna 4835.946 444 Elective 2021-12-14
## 14 Cigna 13669.378 492 Elective 2021-08-14
## 15 Cigna 10342.836 120 Emergency 2021-06-25
## Medication Test.Results
## 1 Aspirin Inconclusive
## 2 Lipitor Normal
## 3 Lipitor Normal
## 4 Penicillin Abnormal
## 5 Paracetamol Normal
## 6 Aspirin Abnormal
## 7 Lipitor Abnormal
## 8 Aspirin Normal
## 9 Aspirin Abnormal
## 10 Aspirin Normal
## 11 Penicillin Abnormal
## 12 Paracetamol Inconclusive
## 13 Aspirin Inconclusive
## 14 Aspirin Normal
## 15 Ibuprofen Inconclusive
#4. Write a user defined function using any of the variables from the data set.
# Created classify_age function and used case_when statement to define
# the age values, mutate() to create new column while assigning values
classify_age <- function(age) {
case_when(
age <= 12 ~ "Child",
age > 12 & age <= 19 ~ "Teenager",
age > 19 & age <= 64 ~ "Adult",
age > 64 ~ "Senior",
TRUE ~ "NA"
)
}
healthcare <- healthcare %>%
mutate(Age_Classification = classify_age(Age))
head(healthcare, 5)
## Name Age Gender Blood.Type Medical.Condition Date.of.Admission
## 1 Tiffany Ramirez 81 Female O- Diabetes 2022-11-17
## 2 Ruben Burns 35 Male O+ Asthma 2023-06-01
## 3 Chad Byrd 61 Male B- Obesity 2019-01-09
## 4 Antonio Frederick 49 Male B- Asthma 2020-05-02
## 5 Mrs. Brandy Flowers 51 Male O- Arthritis 2021-07-09
## Doctor Hospital Insurance.Provider Billing.Amount
## 1 Patrick Parker Wallace-Hamilton Medicare 37490.98
## 2 Diane Jackson Burke, Griffin and Cooper UnitedHealthcare 47304.06
## 3 Paul Baker Walton LLC Medicare 36874.90
## 4 Brian Chandler Garcia Ltd Medicare 23303.32
## 5 Dustin Griffin Jones, Brown and Murray UnitedHealthcare 18086.34
## Room.Number Admission.Type Discharge.Date Medication Test.Results
## 1 146 Elective 2022-12-01 Aspirin Inconclusive
## 2 404 Emergency 2023-06-15 Lipitor Normal
## 3 292 Emergency 2019-02-08 Lipitor Normal
## 4 480 Urgent 2020-05-03 Penicillin Abnormal
## 5 477 Urgent 2021-08-02 Paracetamol Normal
## Age_Classification
## 1 Senior
## 2 Adult
## 3 Adult
## 4 Adult
## 5 Adult
#5. Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset
#Created new dataframe and filtered Seniors and Emergency admission type from the main dataframe, and assigned it t new one
Healthcare_emergency_senior <- as.data.frame(healthcare %>%
filter(Admission.Type == "Emergency", Age_Classification == "Senior"
))
head(Healthcare_emergency_senior, 10)
## Name Age Gender Blood.Type Medical.Condition
## 1 Michael Bradshaw 65 Female AB+ Cancer
## 2 Sally Shaw 80 Male O- Arthritis
## 3 Francis Newman 74 Female AB+ Hypertension
## 4 Rodney Maynard 81 Male AB- Obesity
## 5 Jennifer Rodriguez 83 Female AB- Asthma
## 6 Anna Adams 70 Female A+ Obesity
## 7 Mr. Christopher Miller 81 Male AB+ Hypertension
## 8 Christina Hernandez 84 Female O+ Hypertension
## 9 John Griffin 78 Female B- Cancer
## 10 Gabrielle Russell 68 Male AB+ Cancer
## Date.of.Admission Doctor Hospital
## 1 2021-06-05 Katherine Lowe Castaneda-Hardy
## 2 2019-07-10 Zachary Horton DDS Rush, Owens and Johnson
## 3 2021-05-25 Carolyn Baker MD Hess-Lowe
## 4 2023-07-09 Dr. Kyle Dickson Nixon, Evans and Bradley
## 5 2019-11-12 Rachel Sullivan Duke Group
## 6 2022-09-06 Samuel Taylor Snow Group
## 7 2021-08-15 Amber Gonzalez Ortega and Sons
## 8 2019-10-13 Cody Wright Harris and Sons
## 9 2020-04-03 Kelsey Clark Aguirre LLC
## 10 2020-02-29 Tina Rogers Berry-Gallagher
## Insurance.Provider Billing.Amount Room.Number Admission.Type Discharge.Date
## 1 Cigna 10342.836 120 Emergency 2021-06-25
## 2 Blue Cross 16609.312 366 Emergency 2019-08-07
## 3 Cigna 12680.731 298 Emergency 2021-06-08
## 4 Blue Cross 21001.275 197 Emergency 2023-07-17
## 5 Cigna 5098.663 192 Emergency 2019-11-24
## 6 Medicare 1000.181 258 Emergency 2022-09-18
## 7 Cigna 22519.336 465 Emergency 2021-09-06
## 8 Aetna 10803.730 388 Emergency 2019-10-29
## 9 Aetna 25948.507 359 Emergency 2020-04-06
## 10 Medicare 2628.948 380 Emergency 2020-03-12
## Medication Test.Results Age_Classification
## 1 Ibuprofen Inconclusive Senior
## 2 Ibuprofen Inconclusive Senior
## 3 Lipitor Abnormal Senior
## 4 Lipitor Normal Senior
## 5 Lipitor Abnormal Senior
## 6 Paracetamol Abnormal Senior
## 7 Lipitor Inconclusive Senior
## 8 Aspirin Inconclusive Senior
## 9 Lipitor Normal Senior
## 10 Paracetamol Normal Senior
#6. Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.
Billing_details <- cbind(healthcare$Age , healthcare$Medical.Condition, healthcare$Admission.Type, healthcare$Billing.Amount)
Billing_details <- as.data.frame(Billing_details)
Billing_details <- rename (Billing_details, Age = V1 ,Medical.Condition = V2, Admission.Type = V3, Billing.Amount = V4 )
head(Billing_details, 5)
## Age Medical.Condition Admission.Type Billing.Amount
## 1 81 Diabetes Elective 37490.9833635282
## 2 35 Asthma Emergency 47304.0648454751
## 3 61 Obesity Emergency 36874.8969966128
## 4 49 Asthma Urgent 23303.3220921969
## 5 51 Arthritis Urgent 18086.3441835639
#7. Remove missing values in your dataset.
#import dataset
Medical_data <- read.csv("C:/Users/murth/Medical_data.csv")
#head(Medical_data,10)
#Filter missing values
Medical_data %>% filter(is.na(Age))
## Patient_ID Age Gender Blood_Pressure Cholesterol BMI Smoker Diagnosis
## 1 P1012 NA MALE 160 180 27.8 Yes Diabetes
## 2 P1016 NA Male 120 180 22.5 No nan
## 3 P1017 NA Female 130 NA 27.8 Yes Diabetes
## 4 P1024 NA MALE 140 220 22.5 N Heart Disease
## 5 P1034 NA Male 120 NA 35.4 Yes None
## 6 P1035 NA FEMALE NA 220 35.4 No None
## 7 P1036 NA male 160 220 35.4 Y Heart Disease
## 8 P1050 NA FEMALE 150 180 30.0 No Heart Disease
## 9 P1063 NA nan 140 180 22.5 Yes Diabetes
## 10 P1069 NA Female 130 250 NA nan None
## 11 P1070 NA Female 140 300 22.5 Yes Diabetes
## 12 P1071 NA Female 150 300 40.1 Y Diabetes
## 13 P1074 NA MALE 150 200 40.1 N Heart Disease
## 14 P1081 NA Male 160 200 30.0 Yes None
## 15 P1083 NA nan 140 200 NA Yes Heart Disease
## 16 P1084 NA Male 120 220 22.5 Yes Heart Disease
## 17 P1087 NA Male 140 200 35.4 Yes Heart Disease
## Admission_Date Notes
## 1 2023-06-22
## 2 2023-03-21 Checkup due
## 3 2023-08-01 Follow-up required
## 4 2023-01-20 Follow-up required
## 5 2023-01-12 Follow-up required
## 6 2023-11-26 N/A
## 7 2023-08-12 N/A
## 8 2023-08-08 N/A
## 9 2023-03-23
## 10 2023-10-13 N/A
## 11 2023-08-30 Checkup due
## 12 2023-09-08 N/A
## 13 2023-04-29 Checkup due
## 14 2023-09-24 Follow-up required
## 15 2023-10-09 N/A
## 16 2023-03-04
## 17 2023-11-11 Pending
#Remove missing values in Age
is.na(Medical_data$Age)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [37] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [73] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
## [85] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE
na.omit(Medical_data)
## Patient_ID Age Gender Blood_Pressure Cholesterol BMI Smoker Diagnosis
## 2 P1001 65 Male 150 180 27.8 nan None
## 3 P1002 45 Male 120 220 22.5 No nan
## 5 P1004 65 Male 120 300 40.1 nan None
## 6 P1005 35 Female 130 200 35.4 N Heart Disease
## 11 P1010 55 MALE 140 220 30.0 Y None
## 12 P1011 45 Male 120 300 35.4 No nan
## 14 P1013 65 FEMALE 120 250 27.8 nan nan
## 16 P1015 55 male 140 250 30.0 N Diabetes
## 20 P1019 55 Female 150 250 40.1 Y nan
## 22 P1021 25 Male 160 180 27.8 N None
## 23 P1022 55 nan 120 180 27.8 No Diabetes
## 33 P1032 55 nan 140 300 22.5 Y None
## 34 P1033 55 MALE 140 220 40.1 Yes Diabetes
## 38 P1037 45 nan 150 200 40.1 N None
## 39 P1038 55 MALE 130 300 35.4 nan nan
## 42 P1041 45 male 140 250 35.4 No Heart Disease
## 44 P1043 45 FEMALE 120 300 30.0 Y Diabetes
## 45 P1044 65 MALE 150 250 35.4 No Diabetes
## 46 P1045 25 Male 140 300 40.1 Y Heart Disease
## 48 P1047 55 FEMALE 160 250 35.4 No Heart Disease
## 49 P1048 25 Female 140 220 27.8 nan Diabetes
## 52 P1051 35 FEMALE 140 250 22.5 No Diabetes
## 53 P1052 35 MALE 150 250 30.0 No Heart Disease
## 55 P1054 35 Female 130 300 27.8 Yes Heart Disease
## 57 P1056 35 MALE 140 300 40.1 No Diabetes
## 59 P1058 55 nan 150 220 40.1 No Diabetes
## 61 P1060 55 male 120 180 22.5 Y None
## 62 P1061 65 male 130 300 35.4 nan nan
## 63 P1062 45 FEMALE 120 300 40.1 N nan
## 66 P1065 55 FEMALE 120 300 27.8 nan Heart Disease
## 67 P1066 35 nan 120 220 22.5 Yes nan
## 68 P1067 55 Female 130 250 27.8 N Diabetes
## 73 P1072 35 nan 130 180 40.1 Yes nan
## 74 P1073 55 nan 120 220 22.5 Y nan
## 76 P1075 65 male 150 180 40.1 No nan
## 79 P1078 55 nan 120 200 30.0 N Diabetes
## 86 P1085 55 Female 120 200 40.1 Yes None
## 89 P1088 65 MALE 140 180 40.1 nan None
## 90 P1089 65 nan 150 180 22.5 Y Diabetes
## 94 P1093 25 FEMALE 140 300 35.4 N Heart Disease
## 95 P1094 55 Male 120 200 40.1 nan Heart Disease
## 96 P1095 55 MALE 150 200 35.4 Yes nan
## 98 P1097 65 Female 150 200 27.8 N nan
## 100 P1099 65 nan 140 300 35.4 N None
## Admission_Date Notes
## 2 2023-01-01
## 3 2023-12-14
## 5 2023-07-10 Follow-up required
## 6 2023-11-08 Follow-up required
## 11 2023-06-22 Checkup due
## 12 2023-06-09 N/A
## 14 2023-11-13
## 16 2023-05-29 Checkup due
## 20 2023-09-09 Follow-up required
## 22 2023-06-13 Follow-up required
## 23 2023-08-15 N/A
## 33 2023-09-16
## 34 2023-05-24
## 38 2023-09-29 Pending
## 39 2023-12-24 Follow-up required
## 42 2023-05-02 Follow-up required
## 44 2023-06-13 N/A
## 45 2023-03-31 Follow-up required
## 46 2023-05-16 Pending
## 48 2023-11-12 Checkup due
## 49 2023-06-27 Pending
## 52 2023-12-23 Checkup due
## 53 2023-02-10
## 55 2023-07-11 Pending
## 57 2023-07-20 Checkup due
## 59 2023-08-08 Checkup due
## 61 2023-02-15 Pending
## 62 2023-02-04
## 63 2023-09-10 Follow-up required
## 66 2023-07-16
## 67 2023-10-30 N/A
## 68 2023-01-10 N/A
## 73 2023-12-31 Pending
## 74 2023-01-05 Pending
## 76 2023-10-16 N/A
## 79 2023-08-12
## 86 2023-08-05
## 89 2023-10-10 Follow-up required
## 90 2023-06-26
## 94 2023-11-07 Checkup due
## 95 2023-01-06 Pending
## 96 2023-07-16 Follow-up required
## 98 2023-05-13 Checkup due
## 100 2023-09-16
Medical_data_missing_removed = na.omit(Medical_data)
print(Medical_data_missing_removed)
## Patient_ID Age Gender Blood_Pressure Cholesterol BMI Smoker Diagnosis
## 2 P1001 65 Male 150 180 27.8 nan None
## 3 P1002 45 Male 120 220 22.5 No nan
## 5 P1004 65 Male 120 300 40.1 nan None
## 6 P1005 35 Female 130 200 35.4 N Heart Disease
## 11 P1010 55 MALE 140 220 30.0 Y None
## 12 P1011 45 Male 120 300 35.4 No nan
## 14 P1013 65 FEMALE 120 250 27.8 nan nan
## 16 P1015 55 male 140 250 30.0 N Diabetes
## 20 P1019 55 Female 150 250 40.1 Y nan
## 22 P1021 25 Male 160 180 27.8 N None
## 23 P1022 55 nan 120 180 27.8 No Diabetes
## 33 P1032 55 nan 140 300 22.5 Y None
## 34 P1033 55 MALE 140 220 40.1 Yes Diabetes
## 38 P1037 45 nan 150 200 40.1 N None
## 39 P1038 55 MALE 130 300 35.4 nan nan
## 42 P1041 45 male 140 250 35.4 No Heart Disease
## 44 P1043 45 FEMALE 120 300 30.0 Y Diabetes
## 45 P1044 65 MALE 150 250 35.4 No Diabetes
## 46 P1045 25 Male 140 300 40.1 Y Heart Disease
## 48 P1047 55 FEMALE 160 250 35.4 No Heart Disease
## 49 P1048 25 Female 140 220 27.8 nan Diabetes
## 52 P1051 35 FEMALE 140 250 22.5 No Diabetes
## 53 P1052 35 MALE 150 250 30.0 No Heart Disease
## 55 P1054 35 Female 130 300 27.8 Yes Heart Disease
## 57 P1056 35 MALE 140 300 40.1 No Diabetes
## 59 P1058 55 nan 150 220 40.1 No Diabetes
## 61 P1060 55 male 120 180 22.5 Y None
## 62 P1061 65 male 130 300 35.4 nan nan
## 63 P1062 45 FEMALE 120 300 40.1 N nan
## 66 P1065 55 FEMALE 120 300 27.8 nan Heart Disease
## 67 P1066 35 nan 120 220 22.5 Yes nan
## 68 P1067 55 Female 130 250 27.8 N Diabetes
## 73 P1072 35 nan 130 180 40.1 Yes nan
## 74 P1073 55 nan 120 220 22.5 Y nan
## 76 P1075 65 male 150 180 40.1 No nan
## 79 P1078 55 nan 120 200 30.0 N Diabetes
## 86 P1085 55 Female 120 200 40.1 Yes None
## 89 P1088 65 MALE 140 180 40.1 nan None
## 90 P1089 65 nan 150 180 22.5 Y Diabetes
## 94 P1093 25 FEMALE 140 300 35.4 N Heart Disease
## 95 P1094 55 Male 120 200 40.1 nan Heart Disease
## 96 P1095 55 MALE 150 200 35.4 Yes nan
## 98 P1097 65 Female 150 200 27.8 N nan
## 100 P1099 65 nan 140 300 35.4 N None
## Admission_Date Notes
## 2 2023-01-01
## 3 2023-12-14
## 5 2023-07-10 Follow-up required
## 6 2023-11-08 Follow-up required
## 11 2023-06-22 Checkup due
## 12 2023-06-09 N/A
## 14 2023-11-13
## 16 2023-05-29 Checkup due
## 20 2023-09-09 Follow-up required
## 22 2023-06-13 Follow-up required
## 23 2023-08-15 N/A
## 33 2023-09-16
## 34 2023-05-24
## 38 2023-09-29 Pending
## 39 2023-12-24 Follow-up required
## 42 2023-05-02 Follow-up required
## 44 2023-06-13 N/A
## 45 2023-03-31 Follow-up required
## 46 2023-05-16 Pending
## 48 2023-11-12 Checkup due
## 49 2023-06-27 Pending
## 52 2023-12-23 Checkup due
## 53 2023-02-10
## 55 2023-07-11 Pending
## 57 2023-07-20 Checkup due
## 59 2023-08-08 Checkup due
## 61 2023-02-15 Pending
## 62 2023-02-04
## 63 2023-09-10 Follow-up required
## 66 2023-07-16
## 67 2023-10-30 N/A
## 68 2023-01-10 N/A
## 73 2023-12-31 Pending
## 74 2023-01-05 Pending
## 76 2023-10-16 N/A
## 79 2023-08-12
## 86 2023-08-05
## 89 2023-10-10 Follow-up required
## 90 2023-06-26
## 94 2023-11-07 Checkup due
## 95 2023-01-06 Pending
## 96 2023-07-16 Follow-up required
## 98 2023-05-13 Checkup due
## 100 2023-09-16
#8. Identify and remove duplicated data in your dataset
#Remove duplicates based on healthcare - Name variable
Healthcare_unique = healthcare %>% distinct(healthcare$Name)
Healthcare_unique = healthcare %>% distinct(healthcare$Name, .keep_all = TRUE)
head(Healthcare_unique, 10)
## Name Age Gender Blood.Type Medical.Condition
## 1 Tiffany Ramirez 81 Female O- Diabetes
## 2 Ruben Burns 35 Male O+ Asthma
## 3 Chad Byrd 61 Male B- Obesity
## 4 Antonio Frederick 49 Male B- Asthma
## 5 Mrs. Brandy Flowers 51 Male O- Arthritis
## 6 Patrick Parker 41 Male AB+ Arthritis
## 7 Charles Horton 82 Male AB+ Hypertension
## 8 Patty Norman 55 Female O- Arthritis
## 9 Ryan Hayes 33 Male A+ Diabetes
## 10 Sharon Perez 39 Female O- Asthma
## Date.of.Admission Doctor Hospital
## 1 2022-11-17 Patrick Parker Wallace-Hamilton
## 2 2023-06-01 Diane Jackson Burke, Griffin and Cooper
## 3 2019-01-09 Paul Baker Walton LLC
## 4 2020-05-02 Brian Chandler Garcia Ltd
## 5 2021-07-09 Dustin Griffin Jones, Brown and Murray
## 6 2020-08-20 Robin Green Boyd PLC
## 7 2021-03-22 Patricia Bishop Wheeler, Bryant and Johns
## 8 2019-05-16 Brian Kennedy Brown Inc
## 9 2020-12-17 Kristin Dunn Smith, Edwards and Obrien
## 10 2022-12-15 Jessica Bailey Brown-Golden
## Insurance.Provider Billing.Amount Room.Number Admission.Type Discharge.Date
## 1 Medicare 37490.98 146 Elective 2022-12-01
## 2 UnitedHealthcare 47304.06 404 Emergency 2023-06-15
## 3 Medicare 36874.90 292 Emergency 2019-02-08
## 4 Medicare 23303.32 480 Urgent 2020-05-03
## 5 UnitedHealthcare 18086.34 477 Urgent 2021-08-02
## 6 Aetna 22522.36 180 Urgent 2020-08-23
## 7 Cigna 39593.44 161 Urgent 2021-04-15
## 8 Blue Cross 13546.82 384 Elective 2019-06-02
## 9 Aetna 24903.04 215 Elective 2020-12-22
## 10 Blue Cross 22788.24 310 Urgent 2022-12-16
## Medication Test.Results Age_Classification healthcare$Name
## 1 Aspirin Inconclusive Senior Tiffany Ramirez
## 2 Lipitor Normal Adult Ruben Burns
## 3 Lipitor Normal Adult Chad Byrd
## 4 Penicillin Abnormal Adult Antonio Frederick
## 5 Paracetamol Normal Adult Mrs. Brandy Flowers
## 6 Aspirin Abnormal Adult Patrick Parker
## 7 Lipitor Abnormal Senior Charles Horton
## 8 Aspirin Normal Adult Patty Norman
## 9 Aspirin Abnormal Adult Ryan Hayes
## 10 Aspirin Normal Adult Sharon Perez
# Remove duplicates based on medical condition
Healthcare_unique1 = healthcare %>% distinct(healthcare$Medical.Condition, .keep_all = TRUE)
head(Healthcare_unique1, 5)
## Name Age Gender Blood.Type Medical.Condition Date.of.Admission
## 1 Tiffany Ramirez 81 Female O- Diabetes 2022-11-17
## 2 Ruben Burns 35 Male O+ Asthma 2023-06-01
## 3 Chad Byrd 61 Male B- Obesity 2019-01-09
## 4 Mrs. Brandy Flowers 51 Male O- Arthritis 2021-07-09
## 5 Charles Horton 82 Male AB+ Hypertension 2021-03-22
## Doctor Hospital Insurance.Provider Billing.Amount
## 1 Patrick Parker Wallace-Hamilton Medicare 37490.98
## 2 Diane Jackson Burke, Griffin and Cooper UnitedHealthcare 47304.06
## 3 Paul Baker Walton LLC Medicare 36874.90
## 4 Dustin Griffin Jones, Brown and Murray UnitedHealthcare 18086.34
## 5 Patricia Bishop Wheeler, Bryant and Johns Cigna 39593.44
## Room.Number Admission.Type Discharge.Date Medication Test.Results
## 1 146 Elective 2022-12-01 Aspirin Inconclusive
## 2 404 Emergency 2023-06-15 Lipitor Normal
## 3 292 Emergency 2019-02-08 Lipitor Normal
## 4 477 Urgent 2021-08-02 Paracetamol Normal
## 5 161 Urgent 2021-04-15 Lipitor Abnormal
## Age_Classification healthcare$Medical.Condition
## 1 Senior Diabetes
## 2 Adult Asthma
## 3 Adult Obesity
## 4 Adult Arthritis
## 5 Senior Hypertension
#9. Reorder multiple rows in descending order
# Reorder rows in descending order by Billing Amount
Healthcare_sorted = as.data.frame(healthcare %>% arrange(desc(Billing.Amount)))
head(Healthcare_sorted, 10)
## Name Age Gender Blood.Type Medical.Condition Date.of.Admission
## 1 Daniel Hall 77 Male A+ Hypertension 2018-12-04
## 2 Teresa Buchanan 75 Male B+ Cancer 2020-11-22
## 3 Roy Beck 79 Female A- Arthritis 2020-11-05
## 4 Mary Stein 35 Male A+ Hypertension 2020-12-07
## 5 Richard Jones 59 Female A+ Asthma 2020-05-16
## 6 Holly Clayton 43 Male O- Obesity 2021-11-30
## 7 Jason Miller 74 Male B- Cancer 2019-01-01
## 8 John Oneill 31 Male AB- Diabetes 2019-08-16
## 9 Elizabeth Johnson 50 Female A- Hypertension 2022-07-10
## 10 Robert Potts 67 Female A- Obesity 2019-01-17
## Doctor Hospital Insurance.Provider
## 1 Timothy Serrano Arellano-Mahoney Aetna
## 2 Joseph Rice Ellison-Johnson UnitedHealthcare
## 3 Aaron Mills Thompson, Carlson and Kim Medicare
## 4 Alice Gross DVM Morales, Ferrell and Clark Medicare
## 5 Rebecca Parks Smith, Cooper and Chavez Aetna
## 6 Zachary Castaneda Webster, Oconnell and Norton Medicare
## 7 Matthew Lewis Ford, Gibson and Parker Blue Cross
## 8 Travis Gibbs Dunn Ltd Aetna
## 9 Dawn Haley Sanders, Robertson and Williams Medicare
## 10 William Wilson Harmon-Anderson Blue Cross
## Billing.Amount Room.Number Admission.Type Discharge.Date Medication
## 1 49995.90 196 Emergency 2018-12-11 Paracetamol
## 2 49994.98 296 Urgent 2020-11-28 Aspirin
## 3 49985.97 483 Elective 2020-11-12 Paracetamol
## 4 49974.81 390 Emergency 2020-12-28 Lipitor
## 5 49974.30 242 Emergency 2020-05-31 Aspirin
## 6 49974.16 441 Emergency 2021-12-16 Penicillin
## 7 49958.00 292 Emergency 2019-01-29 Ibuprofen
## 8 49954.97 455 Emergency 2019-09-03 Paracetamol
## 9 49951.26 476 Emergency 2022-08-06 Aspirin
## 10 49947.56 154 Emergency 2019-01-19 Aspirin
## Test.Results Age_Classification
## 1 Abnormal Senior
## 2 Inconclusive Senior
## 3 Inconclusive Senior
## 4 Normal Adult
## 5 Abnormal Adult
## 6 Abnormal Adult
## 7 Inconclusive Senior
## 8 Abnormal Adult
## 9 Inconclusive Adult
## 10 Inconclusive Senior
#10. Rename some of the column names in your dataset
# Create a new dataframe using cbind() with selected columns
Healthcare_New = cbind(healthcare$Name, healthcare$Age, healthcare$Billing.Amount, healthcare$Medical.Condition)
Healthcare_New = as.data.frame(Healthcare_New)
names(Healthcare_New)
## [1] "V1" "V2" "V3" "V4"
Healthcare_New <- rename (Healthcare_New, Patient_Name = V1 ,Patient_Age = V2, Billing_Amount = V3, Medical_Condition = V4 )
names(Healthcare_New)
## [1] "Patient_Name" "Patient_Age" "Billing_Amount"
## [4] "Medical_Condition"
#11. Add new variables in your data frame by using a mathematical function (for e.g. – multiply an existing column by 2 and add it as a new variable to your data frame)
#20% discount on billing amount for senior
Healthcare_discount = healthcare %>%
mutate(Discounted_Billing_Amount =
case_when( Age_Classification == "Senior" ~ Billing.Amount * 0.80,
TRUE ~ Billing.Amount
))
head(Healthcare_discount, 5)
## Name Age Gender Blood.Type Medical.Condition Date.of.Admission
## 1 Tiffany Ramirez 81 Female O- Diabetes 2022-11-17
## 2 Ruben Burns 35 Male O+ Asthma 2023-06-01
## 3 Chad Byrd 61 Male B- Obesity 2019-01-09
## 4 Antonio Frederick 49 Male B- Asthma 2020-05-02
## 5 Mrs. Brandy Flowers 51 Male O- Arthritis 2021-07-09
## Doctor Hospital Insurance.Provider Billing.Amount
## 1 Patrick Parker Wallace-Hamilton Medicare 37490.98
## 2 Diane Jackson Burke, Griffin and Cooper UnitedHealthcare 47304.06
## 3 Paul Baker Walton LLC Medicare 36874.90
## 4 Brian Chandler Garcia Ltd Medicare 23303.32
## 5 Dustin Griffin Jones, Brown and Murray UnitedHealthcare 18086.34
## Room.Number Admission.Type Discharge.Date Medication Test.Results
## 1 146 Elective 2022-12-01 Aspirin Inconclusive
## 2 404 Emergency 2023-06-15 Lipitor Normal
## 3 292 Emergency 2019-02-08 Lipitor Normal
## 4 480 Urgent 2020-05-03 Penicillin Abnormal
## 5 477 Urgent 2021-08-02 Paracetamol Normal
## Age_Classification Discounted_Billing_Amount
## 1 Senior 29992.79
## 2 Adult 47304.06
## 3 Adult 36874.90
## 4 Adult 23303.32
## 5 Adult 18086.34
#12. Create a training set using random number generator engine.
# set.seed() must always be run before any random sampling
set.seed(1234)
#75 % training set
Healthcare_training75 = healthcare %>% sample_frac(.75, replace = FALSE)
str(Healthcare_training75)
## 'data.frame': 7500 obs. of 16 variables:
## $ Name : chr "Joseph Smith" "Timothy Ford" "Christopher Oconnell" "Gilbert Gordon" ...
## $ Age : int 30 26 77 77 74 82 21 63 76 70 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Blood.Type : chr "O+" "B-" "AB+" "A+" ...
## $ Medical.Condition : chr "Diabetes" "Asthma" "Asthma" "Obesity" ...
## $ Date.of.Admission : chr "2019-12-27" "2023-10-16" "2019-08-30" "2023-04-06" ...
## $ Doctor : chr "Rachel Baker" "Richard Ross" "Tyler Torres" "Sandra Johnson" ...
## $ Hospital : chr "Archer Ltd" "Lynch-Martin" "Davis Ltd" "Ayala, Hernandez and Collins" ...
## $ Insurance.Provider: chr "Blue Cross" "Aetna" "Blue Cross" "UnitedHealthcare" ...
## $ Billing.Amount : num 7843 7005 15067 20940 43028 ...
## $ Room.Number : int 494 179 324 222 396 104 245 345 289 263 ...
## $ Admission.Type : chr "Emergency" "Emergency" "Emergency" "Emergency" ...
## $ Discharge.Date : chr "2020-01-01" "2023-11-05" "2019-09-17" "2023-04-27" ...
## $ Medication : chr "Lipitor" "Ibuprofen" "Paracetamol" "Penicillin" ...
## $ Test.Results : chr "Abnormal" "Normal" "Inconclusive" "Abnormal" ...
## $ Age_Classification: chr "Adult" "Adult" "Senior" "Senior" ...
set.seed(1234) #75 % training set Healthcare_training75 = healthcare %>% sample_frac(.75, replace = FALSE) str(Healthcare_training75)
#13. Print the summary statistics of your dataset
summary(healthcare)
## Name Age Gender Blood.Type
## Length :10000 Min. :18.00 Length :10000 Length :10000
## N.unique : 9378 1st Qu.:35.00 N.unique : 2 N.unique : 8
## N.blank : 0 Median :52.00 N.blank : 0 N.blank : 0
## Min.nchar: 7 Mean :51.45 Min.nchar: 4 Min.nchar: 2
## Max.nchar: 25 3rd Qu.:68.00 Max.nchar: 6 Max.nchar: 3
## Max. :85.00
## Medical.Condition Date.of.Admission Doctor Hospital
## Length :10000 Length :10000 Length :10000 Length :10000
## N.unique : 6 N.unique : 1815 N.unique : 9416 N.unique : 8639
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 6 Min.nchar: 10 Min.nchar: 6 Min.nchar: 6
## Max.nchar: 12 Max.nchar: 10 Max.nchar: 26 Max.nchar: 34
##
## Insurance.Provider Billing.Amount Room.Number Admission.Type
## Length :10000 Min. : 1000 Min. :101.0 Length :10000
## N.unique : 5 1st Qu.:13507 1st Qu.:199.0 N.unique : 3
## N.blank : 0 Median :25258 Median :299.0 N.blank : 0
## Min.nchar: 5 Mean :25517 Mean :300.1 Min.nchar: 6
## Max.nchar: 16 3rd Qu.:37734 3rd Qu.:400.0 Max.nchar: 9
## Max. :49996 Max. :500.0
## Discharge.Date Medication Test.Results Age_Classification
## Length :10000 Length :10000 Length :10000 Length :10000
## N.unique : 1834 N.unique : 5 N.unique : 3 N.unique : 3
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 10 Min.nchar: 7 Min.nchar: 6 Min.nchar: 5
## Max.nchar: 10 Max.nchar: 11 Max.nchar: 12 Max.nchar: 8
##
#14. Use any of the numerical variables from the dataset and perform the following statistical functions • Mean , • Median , • Mode , • Range
#Using Billing.Amount as the numerical variable
mean(healthcare$Billing.Amount)
## [1] 25516.81
# Median
median(healthcare$Billing.Amount)
## [1] 25258.11
#Mode
# Mode - no built-in function in R, took reference from https://www.geeksforgeeks.org/r-language/mean-median-and-mode-in-r-programming/
get_mode <- function(x){
unique_x <- unique(x)
unique_x[which.max(tabulate(match(x, unique_x)))]
}
get_mode(healthcare$Billing.Amount)
## [1] 37490.98
# Range
range(healthcare$Billing.Amount)
## [1] 1000.181 49995.902
#15. Plot a scatter plot for any 2 variables in your dataset
ggplot(data = healthcare, aes(x = healthcare$Age,
y=healthcare$Billing.Amount,
col= healthcare$Gender )) + geom_point()
## Warning: Use of `healthcare$Age` is discouraged.
## ℹ Use `Age` instead.
## Warning: Use of `healthcare$Billing.Amount` is discouraged.
## ℹ Use `Billing.Amount` instead.
## Warning: Use of `healthcare$Gender` is discouraged.
## ℹ Use `Gender` instead.
#16. Plot a bar plot for any 2 variables in your dataset
ggplot(data = healthcare, aes(x = healthcare$Medical.Condition,
fill = healthcare$Age_Classification)) + geom_bar()
## Warning: Use of `healthcare$Medical.Condition` is discouraged.
## ℹ Use `Medical.Condition` instead.
## Warning: Use of `healthcare$Age_Classification` is discouraged.
## ℹ Use `Age_Classification` instead.
#17. Find the correlation between any 2 variables by applying Pearson correlation
cor(healthcare$Age, healthcare$Billing.Amount, method = "pearson")
## [1] -0.009483329