Setting up RMarkdown when opening it enables you to create dynamic, reproducible, and visually appealing reports, presentations, and documents, that can help you communicate your data analysis and research findings more effectively.
library(stargazer)
library(dplyr)
library(gtsummary)
library(magrittr)
library(ggplot2)
library(knitr)
library(flextable)
library(report)
library(broom)
library(gapminder)
library(xtable)
library(gridExtra)
library(ggplot2)
library(dplyr)
library(jtools)
library(gtsummary)
library(broom)
y <- 50 + 50
y
[1] 100
data <- read.csv("height.csv")
attach(data)
head(data,5)
Height Gender
1 145.1679 Female
2 137.3663 Female
3 142.6847 Female
4 145.5567 Female
5 151.3897 Female
tail(data,5)
Height Gender
1996 172.0931 Male
1997 164.7082 Male
1998 167.3682 Male
1999 171.2961 Male
2000 173.0944 Male
str(data)
'data.frame': 2000 obs. of 2 variables:
$ Height: num 145 137 143 146 151 ...
$ Gender: chr "Female" "Female" "Female" "Female" ...
summary(data$Height)
Min. 1st Qu. Median Mean 3rd Qu. Max.
132.1 145.0 153.5 155.0 165.2 180.2
library(gtsummary)
library(stargazer)
library(magrittr)
library(knitr)
data [,c(1,2)] %>%
tbl_summary(by = Gender,
statistic = list(all_continuous() ~ "{mean} {median} {sd} {max} {min}"),
digits = list(Height ~ 2)) %>%
add_p() %>%
add_overall() %>%
bold_labels()
| Characteristic | Overall, N = 2,0001 | Female, N = 1,0001 | Male, N = 1,0001 | p-value2 |
|---|---|---|---|---|
| Height | 155.04 153.50 11.10 180.19 132.12 | 144.94 145.04 4.00 159.25 132.12 | 165.14 165.17 5.12 180.19 147.18 | <0.001 |
| 1 Mean Median SD Maximum Minimum | ||||
| 2 Wilcoxon rank sum test | ||||
data1 <- read.csv("vocational_training.csv")
head(data1,5)
Student.ID Age Dual.Training Gender Academic.Performance Vocational.Skills
1 1 20 1 Male High Intermediate
2 2 22 1 Female Medium Advanced
3 3 19 0 Male Low Beginner
4 4 21 0 Female High Advanced
5 5 18 1 Male Medium Intermediate
Math.Score Science.Score Communication.Skills Problem.Solving.Skills
1 85 90 Excellent Advanced
2 75 80 Good Intermediate
3 60 70 Average Beginner
4 90 85 Excellent Advanced
5 80 75 Good Intermediate
Industrial.Project.Score
1 92
2 78
3 65
4 88
5 82
tail(data1,5)
Student.ID Age Dual.Training Gender Academic.Performance Vocational.Skills
36 36 24 1 Female High Intermediate
37 37 19 0 Male Medium Advanced
38 38 23 1 Male Medium Intermediate
39 39 26 0 Male Low Beginner
40 40 20 1 Female Low Intermediate
Math.Score Science.Score Communication.Skills Problem.Solving.Skills
36 68 60 Average Intermediate
37 75 74 Good Beginner
38 60 55 Average Advanced
39 55 60 Good Intermediate
40 78 75 Good Intermediate
Industrial.Project.Score
36 80
37 60
38 78
39 68
40 77
attach(data1)
data1 [,c(6,11)] %>%
tbl_summary(by = Vocational.Skills,
statistic = list(all_continuous() ~ "{mean} {median} {sd} {max} {min}"),
digits = list(Industrial.Project.Score ~ 2)) %>%
add_p() %>%
add_overall() %>%
bold_labels()
| Characteristic | Overall, N = 401 | Advanced, N = 161 | Beginner, N = 81 | Intermediate, N = 161 | p-value2 |
|---|---|---|---|---|---|
| Industrial.Project.Score | 80.85 81.00 9.48 96.00 60.00 | 84.88 88.00 9.21 96.00 60.00 | 67.38 68.00 2.13 70.00 65.00 | 83.56 82.00 4.98 92.00 77.00 | <0.001 |
| 1 Mean Median SD Maximum Minimum | |||||
| 2 Kruskal-Wallis rank sum test | |||||
stargazer(data1[,-3][,-1], type = "text")
===================================================
Statistic N Mean St. Dev. Min Max
---------------------------------------------------
Age 40 20.525 1.739 18 26
Math.Score 40 78.250 10.874 55 94
Science.Score 40 78.900 9.328 55 94
Industrial.Project.Score 40 80.850 9.480 60 96
---------------------------------------------------
data2 <- read.csv("sales.csv")
head(data2)
year sales
1 1801 581.4157
2 1802 464.9344
3 1803 562.6408
4 1804 515.5165
5 1805 397.6773
6 1806 521.4371
tail(data2)
year sales
215 2015 494.5944
216 2016 415.9220
217 2017 482.3292
218 2018 584.5696
219 2019 524.3384
220 2020 427.6165
attach(data2)
One_sample <- t.test(sales, mu=500, alternative = "two.sided", data=data2)
One_sample
One Sample t-test
data: sales
t = -0.36718, df = 219, p-value = 0.7138
alternative hypothesis: true mean is not equal to 500
95 percent confidence interval:
492.1175 505.4067
sample estimates:
mean of x
498.7621
library(rempsyc)
model01 <- t.test(sales, mu=500, data=data2, alternative="two.sided")
stats.table01 <- tidy(model01, conf.int = TRUE)
stats.table01
# A tibble: 1 × 8
estimate statistic p.value parameter conf.low conf.high method alternative
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 499. -0.367 0.714 219 492. 505. One Sampl… two.sided
before<-c(117,111,98,104,105,100,81,89,78)
after<-c(83,85,75,82,82,77,62,69,64)
mydata<-data.frame(before,after)
head(mydata,5)
before after
1 117 83
2 111 85
3 98 75
4 104 82
5 105 82
TTEST<-t.test(before,after,paired=T, data=mydata, alternative = "less")
TTEST
Paired t-test
data: before and after
t = 12.52, df = 8, p-value = 1
alternative hypothesis: true mean difference is less than 0
95 percent confidence interval:
-Inf 26.03331
sample estimates:
mean difference
22.66667
library(rempsyc)
model0 <- t.test(before,after, alternative="less", paired = T, data = mydata)
stats.table0 <- tidy(model0, conf.int = TRUE)
stats.table0
# A tibble: 1 × 8
estimate statistic p.value parameter conf.low conf.high method alternative
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 22.7 12.5 1.00 8 -Inf 26.0 Paired t-… less
weight<-c(135,180,108,128,160,143,175,170,205,195,185,150,175,190,180,220)
weight
[1] 135 180 108 128 160 143 175 170 205 195 185 150 175 190 180 220
gender<-gl(2,8,labels=c("male","female"))
gender
[1] male male male male male male male male female female
[11] female female female female female female
Levels: male female
paired_t_test<-data.frame(weight,gender)
head(paired_t_test,5)
weight gender
1 135 male
2 180 male
3 108 male
4 128 male
5 160 male
#For this case to test for t-test, we use the following command;
t.test(weight~gender,var.equal=T, data = paired_t_test, paired=F)
Two Sample t-test
data: weight by gender
t = -3.2304, df = 14, p-value = 0.006044
alternative hypothesis: true difference in means between group male and group female is not equal to 0
95 percent confidence interval:
-62.60587 -12.64413
sample estimates:
mean in group male mean in group female
149.875 187.500
library(rempsyc)
model1 <- t.test(weight ~ gender, alternative="two.sided", data = paired_t_test)
stats.table1 <- tidy(model1, conf.int = TRUE)
stats.table1
# A tibble: 1 × 10
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -37.6 150. 188. -3.23 0.00631 13.5 -62.7 -12.6
# ℹ 2 more variables: method <chr>, alternative <chr>
library(rempsyc)
nice_table(stats.table1, broom = "t.test")
Method | Alternative | Mean 1 | Mean 2 | M1 - M2 | t | df | p | 95% CI |
|---|---|---|---|---|---|---|---|---|
Welch Two Sample t-test | two.sided | 149.88 | 187.50 | -37.62 | -3.23 | 13.48 | .006 | [-62.70, -12.55] |
head(data,5)
Height Gender
1 145.1679 Female
2 137.3663 Female
3 142.6847 Female
4 145.5567 Female
5 151.3897 Female
model2 <- t.test(Height ~ Gender, alternative="two.sided", data = data)
stats.table <- tidy(model2, conf.int = TRUE)
library(rempsyc)
nice_table(stats.table, broom = "t.test")
Method | Alternative | Mean 1 | Mean 2 | M1 - M2 | t | df | p | 95% CI |
|---|---|---|---|---|---|---|---|---|
Welch Two Sample t-test | two.sided | 144.94 | 165.14 | -20.20 | -98.35 | 1,888.67 | < .001 | [-20.61, -19.80] |
yield <- read.csv("yields.csv")
attach(yield)
head(yield,5)
Yields Fertilizer.Used Blocks
1 54.40207 NPK Block1
2 41.00511 NPK Block1
3 38.92977 NPK Block1
4 54.74481 NPK Block1
5 46.72406 NPK Block1
str(yield)
'data.frame': 400 obs. of 3 variables:
$ Yields : num 54.4 41 38.9 54.7 46.7 ...
$ Fertilizer.Used: chr "NPK" "NPK" "NPK" "NPK" ...
$ Blocks : chr "Block1" "Block1" "Block1" "Block1" ...
anova_model<-aov(Yields~Fertilizer.Used)
summary(anova_model)
Df Sum Sq Mean Sq F value Pr(>F)
Fertilizer.Used 3 29 9.7 0.077 0.973
Residuals 396 50191 126.8
From the results above, the F statistics is given as 0.077 with its associated p-value of approximately 0.973, which is significantly greater than 0.05. This is an indication that there is no significant difference in the average maize yields across the type of fertilizer use at a 5% level of significance.
Tukey HSD is the commonly used post hoc test to determine which two categories are differing in sales from 2016 to 2019.
tukey_hsd<-TukeyHSD(anova_model)
tukey_hsd
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Yields ~ Fertilizer.Used)
$Fertilizer.Used
diff lwr upr p adj
DAP-Ammonia 0.08942084 -3.915192 4.094033 0.9999313
NPK-Ammonia -0.10428473 -4.211957 4.003387 0.9998990
Phosphate-Ammonia 0.62461376 -3.608078 4.857306 0.9811932
NPK-DAP -0.19370557 -4.198318 3.810907 0.9993047
Phosphate-DAP 0.53519292 -3.597558 4.667944 0.9871308
Phosphate-NPK 0.72889849 -3.503793 4.961590 0.9706912
plot(tukey_hsd, las=1)
anova_model2 <- aov(Yields~Blocks, data = yield)
summary(anova_model2)
Df Sum Sq Mean Sq F value Pr(>F)
Blocks 3 22466 7489 106.8 <2e-16 ***
Residuals 396 27754 70
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tukey_hsd1<-TukeyHSD(anova_model2)
tukey_hsd1
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Yields ~ Blocks, data = yield)
$Blocks
diff lwr upr p adj
Block2-Block1 14.649453 11.594896 17.704010 0.00e+00
Block3-Block1 5.677238 2.622681 8.731795 1.37e-05
Block4-Block1 -5.636546 -8.691103 -2.581989 1.61e-05
Block3-Block2 -8.972215 -12.026772 -5.917658 0.00e+00
Block4-Block2 -20.286000 -23.340557 -17.231442 0.00e+00
Block4-Block3 -11.313784 -14.368341 -8.259227 0.00e+00
plot(tukey_hsd1, las=1)
ANOVA stands for analysis of variance and tests for differences in the effects of independent variables on a dependent variable. A two-way ANOVA test is a statistical test used to determine the effect of two nominal predictor variables on a continuous outcome variable.
yield$Fertilizer.Used<-as.factor(yield$Fertilizer.Used)
yield$Blocks<- as.factor((yield$Blocks))
linear_model<- lm(Yields~Fertilizer.Used+Blocks, data = yield)
Two_factor_anova<-anova(linear_model)
Two_factor_anova
Analysis of Variance Table
Response: Yields
Df Sum Sq Mean Sq F value Pr(>F)
Fertilizer.Used 3 29.1 9.7 0.1375 0.9376
Blocks 3 22455.5 7485.2 106.0614 <2e-16 ***
Residuals 393 27735.6 70.6
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
datta <- read.csv("Unemployment.csv")
attach(datta)
head(datta,5)
year Unemployment Inflation FedRate
1 1859 5.133333 0.9084719 3.933333
2 1860 5.233333 1.8107772 3.696667
3 1861 5.533333 1.6227203 2.936667
4 1862 6.266667 1.7953352 2.296667
5 1863 6.800000 0.5370330 2.003333