Set up Rstudio

Setting up RMarkdown when opening it enables you to create dynamic, reproducible, and visually appealing reports, presentations, and documents, that can help you communicate your data analysis and research findings more effectively.

Load the following libraries

library(stargazer)
library(dplyr)
library(gtsummary)
library(magrittr)
library(ggplot2)
library(knitr)
library(flextable)
library(report)
library(broom)
library(gapminder)
library(xtable)
library(gridExtra)
library(ggplot2)
library(dplyr)
library(jtools)
library(gtsummary)
library(broom)

RSTUDIO DAY ONE TRAINING

BASIC MATHEMATICA OPERATIONS

ADDITION

y <- 50 + 50
y
[1] 100

Data Importation

data <- read.csv("height.csv")
attach(data)

Check the first few and last observations

head(data,5)
    Height Gender
1 145.1679 Female
2 137.3663 Female
3 142.6847 Female
4 145.5567 Female
5 151.3897 Female
tail(data,5)
       Height Gender
1996 172.0931   Male
1997 164.7082   Male
1998 167.3682   Male
1999 171.2961   Male
2000 173.0944   Male

Check the structure of the dataset

str(data)
'data.frame':   2000 obs. of  2 variables:
 $ Height: num  145 137 143 146 151 ...
 $ Gender: chr  "Female" "Female" "Female" "Female" ...

Descriptive Statistics

summary(data$Height)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  132.1   145.0   153.5   155.0   165.2   180.2 
library(gtsummary)
library(stargazer)
library(magrittr)
library(knitr)
data [,c(1,2)] %>%
  tbl_summary(by = Gender,
              statistic = list(all_continuous() ~ "{mean} {median} {sd} {max} {min}"),
              digits = list(Height ~ 2)) %>%
  add_p() %>%
  add_overall() %>% 
  bold_labels()
Characteristic Overall, N = 2,0001 Female, N = 1,0001 Male, N = 1,0001 p-value2
Height 155.04 153.50 11.10 180.19 132.12 144.94 145.04 4.00 159.25 132.12 165.14 165.17 5.12 180.19 147.18 <0.001
1 Mean Median SD Maximum Minimum
2 Wilcoxon rank sum test

Import Vocational data

data1 <- read.csv("vocational_training.csv")
head(data1,5)
  Student.ID Age Dual.Training Gender Academic.Performance Vocational.Skills
1          1  20             1   Male                 High      Intermediate
2          2  22             1 Female               Medium          Advanced
3          3  19             0   Male                  Low          Beginner
4          4  21             0 Female                 High          Advanced
5          5  18             1   Male               Medium      Intermediate
  Math.Score Science.Score Communication.Skills Problem.Solving.Skills
1         85            90            Excellent               Advanced
2         75            80                 Good           Intermediate
3         60            70              Average               Beginner
4         90            85            Excellent               Advanced
5         80            75                 Good           Intermediate
  Industrial.Project.Score
1                       92
2                       78
3                       65
4                       88
5                       82
tail(data1,5)
   Student.ID Age Dual.Training Gender Academic.Performance Vocational.Skills
36         36  24             1 Female                 High      Intermediate
37         37  19             0   Male               Medium          Advanced
38         38  23             1   Male               Medium      Intermediate
39         39  26             0   Male                  Low          Beginner
40         40  20             1 Female                  Low      Intermediate
   Math.Score Science.Score Communication.Skills Problem.Solving.Skills
36         68            60              Average           Intermediate
37         75            74                 Good               Beginner
38         60            55              Average               Advanced
39         55            60                 Good           Intermediate
40         78            75                 Good           Intermediate
   Industrial.Project.Score
36                       80
37                       60
38                       78
39                       68
40                       77
attach(data1)

Summary Statistics of Industrial Project Scores across Vocational Skills

data1 [,c(6,11)] %>%
  tbl_summary(by = Vocational.Skills,
              statistic = list(all_continuous() ~ "{mean} {median} {sd} {max} {min}"),
              digits = list(Industrial.Project.Score ~ 2)) %>%
  add_p() %>%
  add_overall() %>% 
  bold_labels()
Characteristic Overall, N = 401 Advanced, N = 161 Beginner, N = 81 Intermediate, N = 161 p-value2
Industrial.Project.Score 80.85 81.00 9.48 96.00 60.00 84.88 88.00 9.21 96.00 60.00 67.38 68.00 2.13 70.00 65.00 83.56 82.00 4.98 92.00 77.00 <0.001
1 Mean Median SD Maximum Minimum
2 Kruskal-Wallis rank sum test
stargazer(data1[,-3][,-1], type = "text")

===================================================
Statistic                N   Mean  St. Dev. Min Max
---------------------------------------------------
Age                      40 20.525  1.739   18  26 
Math.Score               40 78.250  10.874  55  94 
Science.Score            40 78.900  9.328   55  94 
Industrial.Project.Score 40 80.850  9.480   60  96 
---------------------------------------------------

One sample t-test

Import Sales Data

data2 <- read.csv("sales.csv")
head(data2)
  year    sales
1 1801 581.4157
2 1802 464.9344
3 1803 562.6408
4 1804 515.5165
5 1805 397.6773
6 1806 521.4371
tail(data2)
    year    sales
215 2015 494.5944
216 2016 415.9220
217 2017 482.3292
218 2018 584.5696
219 2019 524.3384
220 2020 427.6165
attach(data2)

Null and alternative hypothesis

One_sample <- t.test(sales, mu=500, alternative = "two.sided", data=data2)
One_sample

    One Sample t-test

data:  sales
t = -0.36718, df = 219, p-value = 0.7138
alternative hypothesis: true mean is not equal to 500
95 percent confidence interval:
 492.1175 505.4067
sample estimates:
mean of x 
 498.7621 

Alternative Way to Display the Results Above

library(rempsyc)
model01 <- t.test(sales, mu=500, data=data2, alternative="two.sided")

Broom Table

stats.table01 <- tidy(model01, conf.int = TRUE)
stats.table01
# A tibble: 1 × 8
  estimate statistic p.value parameter conf.low conf.high method     alternative
     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl> <chr>      <chr>      
1     499.    -0.367   0.714       219     492.      505. One Sampl… two.sided  

Interpret the Results

Two Sample (Dependent T-test)

before<-c(117,111,98,104,105,100,81,89,78)
after<-c(83,85,75,82,82,77,62,69,64)

Create the data frame

mydata<-data.frame(before,after)
head(mydata,5)
  before after
1    117    83
2    111    85
3     98    75
4    104    82
5    105    82

Perform the Test

TTEST<-t.test(before,after,paired=T, data=mydata, alternative = "less")
TTEST

    Paired t-test

data:  before and after
t = 12.52, df = 8, p-value = 1
alternative hypothesis: true mean difference is less than 0
95 percent confidence interval:
     -Inf 26.03331
sample estimates:
mean difference 
       22.66667 

Alternative Way of Displaying the Results Above

library(rempsyc)
model0 <- t.test(before,after, alternative="less", paired = T, data = mydata)

Broom Table

stats.table0 <- tidy(model0, conf.int = TRUE)
stats.table0
# A tibble: 1 × 8
  estimate statistic p.value parameter conf.low conf.high method     alternative
     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl> <chr>      <chr>      
1     22.7      12.5    1.00         8     -Inf      26.0 Paired t-… less       

Interpretation

Two Sample T-test (Independent T-Test)

Enter you data

weight<-c(135,180,108,128,160,143,175,170,205,195,185,150,175,190,180,220)
weight
 [1] 135 180 108 128 160 143 175 170 205 195 185 150 175 190 180 220

Create the category group

gender<-gl(2,8,labels=c("male","female"))
gender
 [1] male   male   male   male   male   male   male   male   female female
[11] female female female female female female
Levels: male female

Create the dataframe

paired_t_test<-data.frame(weight,gender)
head(paired_t_test,5)
  weight gender
1    135   male
2    180   male
3    108   male
4    128   male
5    160   male

Perform the test

#For this case to test for t-test, we use the following command;
t.test(weight~gender,var.equal=T, data = paired_t_test, paired=F)

    Two Sample t-test

data:  weight by gender
t = -3.2304, df = 14, p-value = 0.006044
alternative hypothesis: true difference in means between group male and group female is not equal to 0
95 percent confidence interval:
 -62.60587 -12.64413
sample estimates:
  mean in group male mean in group female 
             149.875              187.500 

Alternative Way of Displaying the Results Above

library(rempsyc)
model1 <- t.test(weight ~ gender, alternative="two.sided", data = paired_t_test)

Broom Table

stats.table1 <- tidy(model1, conf.int = TRUE)
stats.table1
# A tibble: 1 × 10
  estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
     <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl>
1    -37.6      150.      188.     -3.23 0.00631      13.5    -62.7     -12.6
# ℹ 2 more variables: method <chr>, alternative <chr>

Present the results

library(rempsyc)
nice_table(stats.table1, broom = "t.test")

Method

Alternative

Mean 1

Mean 2

M1 - M2

t

df

p

95% CI

Welch Two Sample t-test

two.sided

149.88

187.50

-37.62

-3.23

13.48

.006

[-62.70, -12.55]

Interpretation

Additional Examples

Load the data set

head(data,5)
    Height Gender
1 145.1679 Female
2 137.3663 Female
3 142.6847 Female
4 145.5567 Female
5 151.3897 Female
model2 <- t.test(Height ~ Gender, alternative="two.sided", data = data)

Broom Table

stats.table <- tidy(model2, conf.int = TRUE)

Present the results

library(rempsyc)
nice_table(stats.table, broom = "t.test")

Method

Alternative

Mean 1

Mean 2

M1 - M2

t

df

p

95% CI

Welch Two Sample t-test

two.sided

144.94

165.14

-20.20

-98.35

1,888.67

< .001

[-20.61, -19.80]

One-way Analysis of Variance

Load the dataset

yield <- read.csv("yields.csv")
attach(yield)
head(yield,5)
    Yields Fertilizer.Used Blocks
1 54.40207             NPK Block1
2 41.00511             NPK Block1
3 38.92977             NPK Block1
4 54.74481             NPK Block1
5 46.72406             NPK Block1

Structure of the data set

str(yield)
'data.frame':   400 obs. of  3 variables:
 $ Yields         : num  54.4 41 38.9 54.7 46.7 ...
 $ Fertilizer.Used: chr  "NPK" "NPK" "NPK" "NPK" ...
 $ Blocks         : chr  "Block1" "Block1" "Block1" "Block1" ...

One way ANOVA

anova_model<-aov(Yields~Fertilizer.Used) 
summary(anova_model)
                 Df Sum Sq Mean Sq F value Pr(>F)
Fertilizer.Used   3     29     9.7   0.077  0.973
Residuals       396  50191   126.8               

From the results above, the F statistics is given as 0.077 with its associated p-value of approximately 0.973, which is significantly greater than 0.05. This is an indication that there is no significant difference in the average maize yields across the type of fertilizer use at a 5% level of significance.

Post Hoc Test

Tukey HSD is the commonly used post hoc test to determine which two categories are differing in sales from 2016 to 2019.

tukey_hsd<-TukeyHSD(anova_model) 
tukey_hsd
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = Yields ~ Fertilizer.Used)

$Fertilizer.Used
                         diff       lwr      upr     p adj
DAP-Ammonia        0.08942084 -3.915192 4.094033 0.9999313
NPK-Ammonia       -0.10428473 -4.211957 4.003387 0.9998990
Phosphate-Ammonia  0.62461376 -3.608078 4.857306 0.9811932
NPK-DAP           -0.19370557 -4.198318 3.810907 0.9993047
Phosphate-DAP      0.53519292 -3.597558 4.667944 0.9871308
Phosphate-NPK      0.72889849 -3.503793 4.961590 0.9706912

Plot the Results

plot(tukey_hsd, las=1)

One way ANOVA of average yield across blocks

anova_model2 <- aov(Yields~Blocks, data = yield)
summary(anova_model2)
             Df Sum Sq Mean Sq F value Pr(>F)    
Blocks        3  22466    7489   106.8 <2e-16 ***
Residuals   396  27754      70                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Post Hoc Analysis

tukey_hsd1<-TukeyHSD(anova_model2) 
tukey_hsd1
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = Yields ~ Blocks, data = yield)

$Blocks
                    diff        lwr        upr    p adj
Block2-Block1  14.649453  11.594896  17.704010 0.00e+00
Block3-Block1   5.677238   2.622681   8.731795 1.37e-05
Block4-Block1  -5.636546  -8.691103  -2.581989 1.61e-05
Block3-Block2  -8.972215 -12.026772  -5.917658 0.00e+00
Block4-Block2 -20.286000 -23.340557 -17.231442 0.00e+00
Block4-Block3 -11.313784 -14.368341  -8.259227 0.00e+00

Plot the Results

plot(tukey_hsd1, las=1)

Two-Way Analysis of Variance

What Is a 2-Way ANOVA?

ANOVA stands for analysis of variance and tests for differences in the effects of independent variables on a dependent variable. A two-way ANOVA test is a statistical test used to determine the effect of two nominal predictor variables on a continuous outcome variable.

Convert the string variables of interest into factors.

yield$Fertilizer.Used<-as.factor(yield$Fertilizer.Used) 
yield$Blocks<- as.factor((yield$Blocks))

Run the multiple linear model.

linear_model<- lm(Yields~Fertilizer.Used+Blocks, data = yield)

Convert the linear model above into a two way anova model

Two_factor_anova<-anova(linear_model) 
Two_factor_anova
Analysis of Variance Table

Response: Yields
                 Df  Sum Sq Mean Sq  F value Pr(>F)    
Fertilizer.Used   3    29.1     9.7   0.1375 0.9376    
Blocks            3 22455.5  7485.2 106.0614 <2e-16 ***
Residuals       393 27735.6    70.6                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Linear Regression Analysis

Load the data set

datta <- read.csv("Unemployment.csv")
attach(datta)
head(datta,5)
  year Unemployment Inflation  FedRate
1 1859     5.133333 0.9084719 3.933333
2 1860     5.233333 1.8107772 3.696667
3 1861     5.533333 1.6227203 2.936667
4 1862     6.266667 1.7953352 2.296667
5 1863     6.800000 0.5370330 2.003333