# Run from the project folder so the relative CSV path below resolves.
# NOTE(review): setwd() ties the script to this machine's directory layout;
# consider passing a full path to read.csv() instead -- confirm first that
# nothing else relies on the working directory.
setwd(dir = "~/R")

# Import the data set into a data frame.
df <- read.csv(file = "health_data100.csv")

# Library calls

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(stopwords)
## 
## Attaching package: 'stopwords'
## 
## The following object is masked from 'package:tm':
## 
##     stopwords
library(tidytext)
library(stringr)
library(wordcloud)
## Loading required package: RColorBrewer
library(textdata)

# Preview the data

# Show the first six rows as a quick sanity check of the import.
head(x = df, n = 6L)
##   patient_id age    sex ethnicity visit_date      diagnosis
## 1          1  66   Male     Other  9/25/2022         Angina
## 2          2  82 Female     White  6/30/2022           COPD
## 3          3  42 Female     White  11/3/2022         Asthma
## 4          4  35 Female     White  1/10/2022 Hyperlipidemia
## 5          5  66   Male     White   5/1/2022         Angina
## 6          6  64 Female     White  3/30/2022      Back pain

# Unnest tokens and count words

# Break every diagnosis into one token per row (column `word`), then tally
# the occurrences of each token, listing the most frequent first.
word_counts <- count(
  unnest_tokens(df, output = word, input = diagnosis),
  word,
  sort = TRUE
)

# Horizontal bar plot of diagnosis words, sorted by frequency

# Plot the pre-computed word frequencies as horizontal bars.
word_counts %>%
  # Optional: drop words that appear only once to keep the chart readable.
  filter(n > 1) %>%
  # reorder() sorts the y axis by frequency instead of alphabetically, so the
  # most frequent word ends up at the top.
  ggplot(aes(x = n, y = reorder(word, n))) +
  # geom_col() draws bars whose length is the value already stored in `n`;
  # it replaces the older geom_bar(stat = "identity") spelling.
  geom_col(fill = "skyblue") +
  labs(
    title = "Most Frequent Diagnosis Words",
    x = "Frequency",
    y = "Word"
  ) +
  theme_minimal()