#Pakete laden
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(rmarkdown)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.2 ✔ stringr 1.5.0
## ✔ purrr 1.0.1 ✔ tibble 3.1.8
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ mosaic::count() masks dplyr::count()
## ✖ purrr::cross() masks mosaic::cross()
## ✖ mosaic::do() masks dplyr::do()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ mosaic::stat() masks ggplot2::stat()
## ✖ mosaic::tally() masks dplyr::tally()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
#Einlesen des FOM Datensatzes
Weihnachtsumfrage_FOM <- read_csv2("Weihnachtsumfrage FOM.csv")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 12000 Columns: 112── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (24): Source.Name, FOM, D2, Bundesland, X1, X2, X4, X10.1, X10.2, X10.3,...
## dbl (88): X11.1, X3.1, X3.2, X3.3, X3.4, X5.1, X5.2, X5.3, X5.4, X6.1, X6.2,...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Variablen selektieren
Weihnachtsumfrage_FOM<-
Weihnachtsumfrage_FOM%>%
select(D2, D1, X11.10, X11.2)
#Variablen umbenennen
Weihnachtsumfrage_FOM1<-
Weihnachtsumfrage_FOM%>%
rename(Geschlecht=D2)%>%
rename(Alter=D1)%>%
rename(Ausgaben=X11.10)%>%
rename(Gesamtausgaben=X11.2)
#Datensatz um fehlende Daten bereinigen
Weihnachtsumfrage_2FOM<-na.omit(Weihnachtsumfrage_FOM1)
#fehlerhafte Variable umcodieren
Weihnachtsumfrage_4FOM<-Weihnachtsumfrage_2FOM%>%
mutate(Geschlecht=ifelse(Geschlecht=="weiblich","Frau","Mann"))
#zu untersuchende Altersgruppe selektieren
select_age<-Weihnachtsumfrage_4FOM%>%
select(Geschlecht, Alter, Ausgaben,Gesamtausgaben)%>%
filter(Alter>24)%>%
filter(Alter<32)%>%
filter(Geschlecht!="divers")
#Variablen Zählen
select_age%>%
count(Geschlecht)
## # A tibble: 2 × 2
## Geschlecht n
## <chr> <int>
## 1 Frau 640
## 2 Mann 628
#Säulendiagramm Geschlechter
gf_bar(~Geschlecht, data=select_age)

#Verteilung Ausgaben Altersgruppe 24-32
gf_histogram(~Ausgaben, data=select_age)

#Außreißer identifizieren
gf_boxplot(Ausgaben~Geschlecht, data=select_age)

tally(~Geschlecht, data=select_age)
## Geschlecht
## Frau Mann
## 640 628
tally(~Geschlecht, format="proportion", data=select_age)
## Geschlecht
## Frau Mann
## 0.5047319 0.4952681
lm1<-lm(Ausgaben~Gesamtausgaben, data=select_age)
cooks<-cooks.distance(lm1)
cooks %>% head(3)
## 1 2 3
## 1.447752e-04 1.734915e-05 7.331057e-06
gf_point(cooks~1:length(cooks))

newselect_age<-select_age%>%
filter(Ausgaben<500)%>%
filter(Geschlecht!="divers")
gf_point(Ausgaben~Geschlecht, data=newselect_age)

#Ausgaben Altersgruppe
favstats(Ausgaben~Geschlecht, data=newselect_age)
## Geschlecht min Q1 median Q3 max mean sd n missing
## 1 Frau 0 200 250 300 470 254.4925 106.4869 469 0
## 2 Mann 0 150 250 300 450 237.8881 110.0730 438 0
###Inferenzstatistik
#lineare Regression
#t-Test
mean(Ausgaben~Geschlecht, data= newselect_age)
## Frau Mann
## 254.4925 237.8881
dEffekt_A1<-diffmean(Ausgaben~Geschlecht, data=newselect_age)
Null_A3<-do(10000)*diffmean(Ausgaben~shuffle(Geschlecht), data=newselect_age)
pVal_A3<-prop(~abs(diffmean)>=abs(dEffekt_A1), data=Null_A3)
gf_point(Ausgaben~Geschlecht, data= newselect_age)

#multible Regression
lRes_S<-lm(Ausgaben~Geschlecht, data=newselect_age)
coef(lRes_S)
## (Intercept) GeschlechtMann
## 254.49254 -16.60441
rsquared(lRes_S)
## [1] 0.005855514
lRes_S<-lm(Ausgaben~Geschlecht, data=newselect_age)
summary(lRes_S)
##
## Call:
## lm(formula = Ausgaben ~ Geschlecht, data = newselect_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -254.493 -87.888 -4.493 62.112 215.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 254.493 4.998 50.921 <2e-16 ***
## GeschlechtMann -16.604 7.192 -2.309 0.0212 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 108.2 on 905 degrees of freedom
## Multiple R-squared: 0.005856, Adjusted R-squared: 0.004757
## F-statistic: 5.33 on 1 and 905 DF, p-value: 0.02118
dBeta_Geschlecht<-coef(lRes_S)[2]
set.seed(2021)
Bootvtlg<-do(10000)*lm(Ausgaben~Geschlecht,data=resample(newselect_age))
gf_histogram(~GeschlechtMann, data=Bootvtlg)%>%
gf_vline(xintercept=0)

inspect(Bootvtlg)
##
## quantitative variables:
## name class min Q1 median
## 1 Intercept numeric 2.358985e+02 2.511951e+02 2.544968e+02
## 2 GeschlechtMann numeric -4.129403e+01 -2.149550e+01 -1.655483e+01
## 3 sigma numeric 1.002674e+02 1.067633e+02 1.081066e+02
## 4 r.squared numeric 1.041630e-08 2.894677e-03 5.809503e-03
## 5 F numeric 9.426754e-06 2.627288e+00 5.288322e+00
## 6 numdf numeric 1.000000e+00 1.000000e+00 1.000000e+00
## 7 dendf numeric 9.050000e+02 9.050000e+02 9.050000e+02
## 8 .row integer 1.000000e+00 1.000000e+00 1.000000e+00
## 9 .index numeric 1.000000e+00 2.500750e+03 5.000500e+03
## Q3 max mean sd n missing
## 1 2.577812e+02 2.717081e+02 2.544799e+02 4.970685e+00 10000 0
## 2 -1.166038e+01 8.663375e+00 -1.658135e+01 7.241891e+00 10000 0
## 3 1.094442e+02 1.162784e+02 1.081037e+02 1.992524e+00 10000 0
## 4 9.792637e-03 3.566888e-02 6.931131e-03 5.265433e-03 10000 0
## 5 8.949981e+00 3.347433e+01 6.342227e+00 4.861508e+00 10000 0
## 6 1.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 10000 0
## 7 9.050000e+02 9.050000e+02 9.050000e+02 0.000000e+00 10000 0
## 8 1.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 10000 0
## 9 7.500250e+03 1.000000e+04 5.000500e+03 2.886896e+03 10000 0
qdata(~GeschlechtMann, p=c(0.025,0.975), data=Bootvtlg)
## 2.5% 97.5%
## -30.673886 -2.385298
#multible Regression
abw.stipro<-diffmean(Ausgaben~Geschlecht, data=newselect_age)
#Simulation der Daten unter der Annahme H0
set.seed(2021)
Nullvtlg_metr<-do(10000)*diffmean(Ausgaben~shuffle(Geschlecht), data=newselect_age)
vQN_m<-qdata(~diffmean, data=Nullvtlg_metr, p=c(0.025,0.975))
gf_histogram(~diffmean, data=Nullvtlg_metr) %>%
gf_vline(xintercept = ~vQN_m[1]) %>%
gf_vline(xintercept = ~vQN_m[2]) %>%
gf_vline(xintercept = ~abw.stipro, color="red")

PWert_m<-prop(~abs(diffmean)>=abs(abw.stipro), data=Nullvtlg_metr)