# Import the data set.
# The path to the CSV is built explicitly instead of calling setwd(), so
# loading the data does not change the working directory of the R session.
data_path <- path.expand(file.path("~", "R", "health_data100.csv"))
if (!file.exists(data_path)) {
  # Fail early with a readable message rather than read.csv()'s generic error.
  stop("Data file not found: ", data_path, call. = FALSE)
}
df <- read.csv(data_path)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stopwords)
##
## Attaching package: 'stopwords'
##
## The following object is masked from 'package:tm':
##
## stopwords
library(tidytext)
library(stringr)
library(wordcloud)
## Loading required package: RColorBrewer
library(textdata)
# Preview the first six rows to confirm the columns were parsed as expected.
head(df, n = 6L)
## patient_id age sex ethnicity visit_date diagnosis
## 1 1 66 Male Other 9/25/2022 Angina
## 2 2 82 Female White 6/30/2022 COPD
## 3 3 42 Female White 11/3/2022 Asthma
## 4 4 35 Female White 1/10/2022 Hyperlipidemia
## 5 5 66 Male White 5/1/2022 Angina
## 6 6 64 Female White 3/30/2022 Back pain
# Break every diagnosis into individual word tokens (one row per word), then
# tally how often each word occurs, with the most frequent words first.
word_counts <- count(
  unnest_tokens(df, word, diagnosis),
  word,
  sort = TRUE
)
# Horizontal bar chart of diagnosis words, one bar per word, ordered by count.
# Words that appear only once are dropped (optional) to keep the chart readable.
ggplot(
  filter(word_counts, n > 1),
  aes(x = n, y = reorder(word, n))
) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Most Frequent Diagnosis Words",
    x = "Frequency",
    y = "Word"
  ) +
  theme_minimal()