R learning session 1

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(readr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

norsjo <- read_csv("norsjo86.csv")

## New names:
## Rows: 260 Columns: 11
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): agegrp, sex, smoker dbl (8): ...1, health, height, weight, sbp, dbp,
## cholesterol, bmi
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

table(norsjo$sex)

## 
##   man woman 
##   128   132

#5. Frequency table
library(summarytools)

## 
## Attaching package: 'summarytools'
## 
## The following object is masked from 'package:tibble':
## 
##     view

freq(norsjo$sex)

## Frequencies  
## norsjo$sex  
## Type: Character  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##         man    128     49.23          49.23     49.23          49.23
##       woman    132     50.77         100.00     50.77         100.00
##        <NA>      0                               0.00         100.00
##       Total    260    100.00         100.00    100.00         100.00

freq(norsjo$sex,
     report.nas = FALSE, # remove NA (missing values) information
     totals = FALSE, # remove totals
     cumul = FALSE, # remove cumulative
     headings = FALSE) # remove heading

## 
##               Freq       %
## ----------- ------ -------
##         man    128   49.23
##       woman    132   50.77

norsjo$bmi_groups<-cut(norsjo$bmi,breaks=c(0,25,30,Inf), labels=c("normal","overweight","obese"))        
norsjo$hypertonia <- 0
norsjo$hypertonia<- as.factor(ifelse(norsjo$sbp>140 | norsjo$dbp>90, "yes", "no"))
?ctable

## starting httpd help server ... done

ctable(x = norsjo$bmi_groups, y = norsjo$hypertonia, prop = 'r')

## Cross-Tabulation, Row Proportions  
## bmi_groups * hypertonia  
## Data Frame: norsjo  
## 
## ------------ ------------ ------------- ------------ ----------- --------------
##                hypertonia            no          yes        <NA>          Total
##   bmi_groups                                                                   
##       normal                124 (86.1%)   19 (13.2%)   1 ( 0.7%)   144 (100.0%)
##   overweight                 60 (70.6%)   25 (29.4%)   0 ( 0.0%)    85 (100.0%)
##        obese                 12 (44.4%)   15 (55.6%)   0 ( 0.0%)    27 (100.0%)
##         <NA>                  1 (25.0%)    1 (25.0%)   2 (50.0%)     4 (100.0%)
##        Total                197 (75.8%)   60 (23.1%)   3 ( 1.2%)   260 (100.0%)
## ------------ ------------ ------------- ------------ ----------- --------------

ctable(x = norsjo$bmi_groups, y = norsjo$hypertonia, prop = 'r', useNA = 'no')

## Cross-Tabulation, Row Proportions  
## bmi_groups * hypertonia  
## Data Frame: norsjo  
## 
## ------------ ------------ ------------- ------------ --------------
##                hypertonia            no          yes          Total
##   bmi_groups                                                       
##       normal                124 (86.7%)   19 (13.3%)   143 (100.0%)
##   overweight                 60 (70.6%)   25 (29.4%)    85 (100.0%)
##        obese                 12 (44.4%)   15 (55.6%)    27 (100.0%)
##        Total                196 (76.9%)   59 (23.1%)   255 (100.0%)
## ------------ ------------ ------------- ------------ --------------

ctable(x = norsjo$bmi_groups, y = norsjo$hypertonia, prop = 'c')

## Cross-Tabulation, Column Proportions  
## bmi_groups * hypertonia  
## Data Frame: norsjo  
## 
## ------------ ------------ -------------- ------------- ------------ --------------
##                hypertonia             no           yes         <NA>          Total
##   bmi_groups                                                                      
##       normal                124 ( 62.9%)   19 ( 31.7%)   1 ( 33.3%)   144 ( 55.4%)
##   overweight                 60 ( 30.5%)   25 ( 41.7%)   0 (  0.0%)    85 ( 32.7%)
##        obese                 12 (  6.1%)   15 ( 25.0%)   0 (  0.0%)    27 ( 10.4%)
##         <NA>                  1 (  0.5%)    1 (  1.7%)   2 ( 66.7%)     4 (  1.5%)
##        Total                197 (100.0%)   60 (100.0%)   3 (100.0%)   260 (100.0%)
## ------------ ------------ -------------- ------------- ------------ --------------

descr(norsjo, stats = c("mean", "sd"), transpose = TRUE, headings = FALSE)

## Non-numerical variable(s) ignored: agegrp, sex, smoker, bmi_groups, hypertonia

## 
##                       Mean   Std.Dev
## ----------------- -------- ---------
##              ...1   130.50     75.20
##               bmi    25.27      3.98
##       cholesterol     6.54      1.48
##               dbp    79.17     11.67
##            health     0.40      0.49
##            height   170.91      8.71
##               sbp   124.48     19.00
##            weight    73.89     12.88

mean(norsjo$bmi) # what happens here?

## [1] NA

summary(norsjo$bmi)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   16.71   22.58   24.52   25.27   27.28   45.03       4

mean(norsjo$bmi,na.rm=T) # na.rm=T helps

## [1] 25.26896

round(mean(norsjo$bmi,na.rm=T),2)

## [1] 25.27

stby(data = norsjo$bmi, INDICES = norsjo$sex, 
     FUN = descr, stats = "common", transpose = TRUE)

## Descriptive Statistics  
## bmi by sex  
## Data Frame: norsjo  
## N: 128  
## 
##                Mean   Std.Dev     Min   Median     Max   N.Valid   Pct.Valid
## ----------- ------- --------- ------- -------- ------- --------- -----------
##         man   25.32      2.94   19.44    24.96   35.15    125.00       97.66
##       woman   25.22      4.78   16.71    23.95   45.03    131.00       99.24

stby(data = norsjo$bmi, INDICES = norsjo$sex, 
     FUN = descr, stats = c("mean", "sd"), transpose = TRUE)

## Descriptive Statistics  
## bmi by sex  
## Data Frame: norsjo  
## N: 128  
## 
##                Mean   Std.Dev
## ----------- ------- ---------
##         man   25.32      2.94
##       woman   25.22      4.78

mean(norsjo$bmi,)

## [1] NA

?mean
# If you don't have the package installed, uncomment the following line:
# install.packages("dplyr")

library(dplyr)

# Mean of 'value' by 'group'
norsjo %>%  group_by(sex) %>%
  summarise(mean_value = mean(bmi,na.rm=T))

## # A tibble: 2 × 2
##   sex   mean_value
##   <chr>      <dbl>
## 1 man         25.3
## 2 woman       25.2

ggplot(norsjo, aes(x= bmi)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(norsjo, aes(x= bmi)) + geom_histogram(bins=10) # bins controls the number of intervals

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(norsjo, aes(x = bmi)) + geom_histogram(binwidth = 1, colour = "blue", fill = "aquamarine") # Changing the binwidth and colours

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(norsjo, aes(x = bmi)) + geom_histogram(aesbinwidth = 1, colour = "blue", fill = "aquamarine")

## Warning in geom_histogram(aesbinwidth = 1, colour = "blue", fill =
## "aquamarine"): Ignoring unknown parameters: `aesbinwidth`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(norsjo, aes(bmi)) + geom_histogram(aes(y = ..density..), bins=100,
                                          color="black", fill="dodgerblue") +
  labs(title="Simple Histogram with density line",
       x= "BMI")+
  theme(
    plot.title = element_text(color = "red", size = 20, 
                              face = "bold"),                       # Plot title
    axis.title = element_text(size = 18),             # Axis titles
    axis.text= element_text(size = 14))               # Axis text size

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## Removed 4 rows containing non-finite outside the scale range (`stat_bin()`).
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot(norsjo, aes(bmi)) + geom_histogram(aes(y = ..density..), bins=100,
                                          color="black", fill="dodgerblue") +
  labs(title="Simple Histogram with density line",
       x= "BMI")+
  geom_density(col=3)

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).

ggplot(norsjo, aes(x=cholesterol, y=bmi)) + geom_point()

## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Install car package if not installed
#install.packages("car")

# Load the car package
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

# Simulate BMI data for women and men
set.seed(123)  # For reproducibility
women_bmi <- rnorm(364, mean = 23.0, sd = 4.17)
men_bmi <- rnorm(265, mean = 23.4, sd = 4.12)

# Combine the BMI data into a single vector
bmi <- c(women_bmi, men_bmi)

# Create a group factor indicating 'Women' and 'Men'
group <- factor(c(rep("Women", 364), rep("Men", 265)))

# Perform Levene's test for equality of variances
leveneTest(bmi ~ group)

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1       0  0.997
##       627

# Install the pwr package if not already installed
#install.packages("pwr")

# Load the pwr package
library(pwr)

# Set the parameters
effect_size <- 1   # Effect size (mean difference divided by pooled SD)
alpha <- 0.05          # Significance level
power <- 0.7           # Desired power

# Conduct the power analysis for a two-sample t-test
power_analysis <- pwr.t.test(d = effect_size, sig.level = alpha, power = power, type = "two.sample")

# View the result
power_analysis

## 
##      Two-sample t test power calculation 
## 
##               n = 13.37179
##               d = 1
##       sig.level = 0.05
##           power = 0.7
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

# Computer Session 2

library(Publish)

## Loading required package: prodlim

ci.mean(norsjo$sbp)# Confidence interval for systolic blood pressure

##  mean   CI-95%         
##  124.48 [122.14;126.81]

table(norsjo$smoker)

## 
## non-smoker     smoker 
##        206         54

?ci.mean

ci.mean(norsjo$sbp~norsjo$smoker, data = norsjo)

##  norsjo$smoker mean   CI-95%         
##  non-smoker    125.61 [123.01;128.21]
##  smoker        120.24 [114.95;125.54]

ci.mean(norsjo$sbp~norsjo$sex, data = norsjo)

##  norsjo$sex mean   CI-95%         
##  man        125.33 [122.21;128.45]
##  woman      123.65 [120.14;127.15]

#install.packages("plyr")
library(plyr)

## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact

norsjo$agegrp.cat <- as.factor(norsjo$agegrp)
norsjo$agegrp2 <- revalue(norsjo$agegrp.cat, c('30 years old'="1", '40 years old'="1", '50 years old'="2", '60 years old'="2"))
table(norsjo$agegrp2)

## 
##   1   2 
## 131 129

ci.mean(norsjo$sbp~norsjo$agegrp2, data = norsjo)

##  norsjo$agegrp2 mean   CI-95%         
##  1              117.12 [114.52;119.71]
##  2              132.02 [128.54;135.49]

?recode()
library(dplyr)
norsjo$age_cat <- recode_factor(
  norsjo$agegrp2,
  `1` = "Below 49 Years",
  `2` = "50 and Above 50 years",
  .default = "D"
)


ci.mean(norsjo$sbp~norsjo$age_cat, data = norsjo)

##  norsjo$age_cat        mean   CI-95%         
##  Below 49 Years        117.12 [114.52;119.71]
##  50 and Above 50 years 132.02 [128.54;135.49]

library(tidyverse)
ggplot(norsjo, aes(sample=sbp))+stat_qq() + stat_qq_line() +
  labs(title="Q-Q plot", y = "SBP")+  theme_classic()

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_qq()`).

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

ggplot(norsjo, aes(sample=dbp))+stat_qq() + stat_qq_line()

## Warning: Removed 3 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 3 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

ggplot(norsjo, aes(sample=dbp))+stat_qq() + stat_qq_line() +
  labs(title="Q-Q plot", y = "DBP")

## Warning: Removed 3 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 3 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

library(haven)
paired<-read_dta("paired_data.dta")

t.test(paired$Clock_8, paired$Clock_20, paired=TRUE)

## 
##  Paired t-test
## 
## data:  paired$Clock_8 and paired$Clock_20
## t = 2.0847, df = 19, p-value = 0.05083
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.004397094  2.204397094
## sample estimates:
## mean difference 
##             1.1

paired$diff<-paired$Clock_20-paired$Clock_8 # Calculating a variable with the difference between afternoon and morning
sd.diff=sd(paired$diff)/sqrt(nrow(paired)) # Calculating standard error of the mean 
testvalue=mean(paired$diff)/sd.diff # Compare test-value with critical value



wilcox.test(paired$Clock_8, paired$Clock_20, paired = TRUE, alternative = "two.sided",exact=FALSE)

## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  paired$Clock_8 and paired$Clock_20
## V = 129.5, p-value = 0.056
## alternative hypothesis: true location shift is not equal to 0

t.test(sbp~sex,data=norsjo,var.equal=T)# groups are separated by sex

## 
##  Two Sample t-test
## 
## data:  sbp by sex
## t = 0.70989, df = 255, p-value = 0.4784
## alternative hypothesis: true difference in means between group man and group woman is not equal to 0
## 95 percent confidence interval:
##  -2.988604  6.357713
## sample estimates:
##   mean in group man mean in group woman 
##            125.3307            123.6462

t_bmi_sex<-t.test(bmi~sex,data=norsjo,var.equal=T) # Stores the output of the function in a list 
t_bmi_sex[] # Writing only t will give you all information in the list

## $statistic
##         t 
## 0.2024534 
## 
## $parameter
##  df 
## 254 
## 
## $p.value
## [1] 0.8397243
## 
## $conf.int
## [1] -0.8811714  1.0831034
## attr(,"conf.level")
## [1] 0.95
## 
## $estimate
##   mean in group man mean in group woman 
##            25.32063            25.21966 
## 
## $null.value
## difference in means between group man and group woman 
##                                                     0 
## 
## $stderr
## [1] 0.4987121
## 
## $alternative
## [1] "two.sided"
## 
## $method
## [1] " Two Sample t-test"
## 
## $data.name
## [1] "bmi by sex"

t_dbp_sex<-t.test(dbp~sex,data=norsjo,var.equal=T) # Stores the output of the function in a list 
t_dbp_sex[] # Writing only t will give you all information in the list

## $statistic
##         t 
## 0.7340719 
## 
## $parameter
##  df 
## 255 
## 
## $p.value
## [1] 0.4635793
## 
## $conf.int
## [1] -1.800848  3.941248
## attr(,"conf.level")
## [1] 0.95
## 
## $estimate
##   mean in group man mean in group woman 
##            79.70866            78.63846 
## 
## $null.value
## difference in means between group man and group woman 
##                                                     0 
## 
## $stderr
## [1] 1.457895
## 
## $alternative
## [1] "two.sided"
## 
## $method
## [1] " Two Sample t-test"
## 
## $data.name
## [1] "dbp by sex"

t_Chol_sex<-t.test(norsjo$cholesterol~sex,data=norsjo,var.equal=T) # Stores the output of the function in a list 
t_Chol_sex[] # Writing only t will give you all information in the list

## $statistic
##         t 
## 0.4769153 
## 
## $parameter
##  df 
## 255 
## 
## $p.value
## [1] 0.6338309
## 
## $conf.int
## [1] -0.2757759  0.4520318
## attr(,"conf.level")
## [1] 0.95
## 
## $estimate
##   mean in group man mean in group woman 
##            6.580952            6.492824 
## 
## $null.value
## difference in means between group man and group woman 
##                                                     0 
## 
## $stderr
## [1] 0.1847874
## 
## $alternative
## [1] "two.sided"
## 
## $method
## [1] " Two Sample t-test"
## 
## $data.name
## [1] "norsjo$cholesterol by sex"

publish(t.test(sbp~agegrp2,data=norsjo,var.equal=T))

## 
##  The Two Sample t-test to compare the difference in means between group 1 and group 2 for sbp by agegrp2 yields a mean difference of 14.90 (CI-95% = [-19.20;-10.60]; p-value = <0.001).

library(car)

leveneTest(sbp ~ agegrp2, data = norsjo)

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   1  5.2111 0.02327 *
##       255                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

leveneTest(sbp ~ sex, data = norsjo)

## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  2.4154 0.1214
##       255

t.test(sbp ~ agegrp2, data = filter(norsjo, sex=="man"),var.equal=T)

## 
##  Two Sample t-test
## 
## data:  sbp by agegrp2
## t = -3.642, df = 125, p-value = 0.0003948
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -16.927289  -5.007537
## sample estimates:
## mean in group 1 mean in group 2 
##        120.1493        131.1167

t.test(sbp ~ agegrp2, data = filter(norsjo, sex=="woman"),var.equal=T)

## 
##  Two Sample t-test
## 
## data:  sbp by agegrp2
## t = -6.0335, df = 128, p-value = 1.611e-08
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -25.14071 -12.72331
## sample estimates:
## mean in group 1 mean in group 2 
##        113.8889        132.8209

table(norsjo$sex)

## 
##   man woman 
##   128   132

wilcox.test(bmi~sex,data=norsjo)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  bmi by sex
## W = 9196.5, p-value = 0.08857
## alternative hypothesis: true location shift is not equal to 0

R learning session 1

BM

2024-10-22

R Markdown

Including Plots