#Pakete laden
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(rmarkdown)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.1     ✔ tibble    3.1.8
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ mosaic::count() masks dplyr::count()
## ✖ purrr::cross()  masks mosaic::cross()
## ✖ mosaic::do()    masks dplyr::do()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ mosaic::stat()  masks ggplot2::stat()
## ✖ mosaic::tally() masks dplyr::tally()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
#Einlesen des FOM Datensatzes
Weihnachtsumfrage_FOM <- read_csv2("Weihnachtsumfrage FOM.csv")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 12000 Columns: 112── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (24): Source.Name, FOM, D2, Bundesland, X1, X2, X4, X10.1, X10.2, X10.3,...
## dbl (88): X11.1, X3.1, X3.2, X3.3, X3.4, X5.1, X5.2, X5.3, X5.4, X6.1, X6.2,...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Variablen selektieren
Weihnachtsumfrage_FOM<-
  Weihnachtsumfrage_FOM%>%
  select(D2, D1, X11.10, X11.2)
  
#Variablen umbenennen
Weihnachtsumfrage_FOM1<-
  Weihnachtsumfrage_FOM%>%
  rename(Geschlecht=D2)%>%
  rename(Alter=D1)%>%
  rename(Ausgaben=X11.10)%>%
  rename(Gesamtausgaben=X11.2)

#Datensatz um fehlende Daten bereinigen
Weihnachtsumfrage_2FOM<-na.omit(Weihnachtsumfrage_FOM1)

#fehlerhafte Variable umcodieren
Weihnachtsumfrage_4FOM<-Weihnachtsumfrage_2FOM%>%
  mutate(Geschlecht=ifelse(Geschlecht=="weiblich","Frau","Mann"))


#zu untersuchende Altersgruppe selektieren

select_age<-Weihnachtsumfrage_4FOM%>%
  select(Geschlecht, Alter, Ausgaben,Gesamtausgaben)%>%
  filter(Alter>24)%>%
  filter(Alter<32)%>%
  filter(Geschlecht!="divers")

#Variablen Zählen
select_age%>%
  count(Geschlecht)
## # A tibble: 2 × 2
##   Geschlecht     n
##   <chr>      <int>
## 1 Frau         640
## 2 Mann         628
#Säulendiagramm Geschlechter
gf_bar(~Geschlecht, data=select_age)

#Verteilung Ausgaben Altersgruppe 24-32
gf_histogram(~Ausgaben, data=select_age)

#Außreißer identifizieren
gf_boxplot(Ausgaben~Geschlecht, data=select_age)

tally(~Geschlecht, data=select_age)
## Geschlecht
## Frau Mann 
##  640  628
tally(~Geschlecht, format="proportion", data=select_age)
## Geschlecht
##      Frau      Mann 
## 0.5047319 0.4952681
lm1<-lm(Ausgaben~Gesamtausgaben, data=select_age)
cooks<-cooks.distance(lm1)
cooks %>% head(3)
##            1            2            3 
## 1.447752e-04 1.734915e-05 7.331057e-06
gf_point(cooks~1:length(cooks))

newselect_age<-select_age%>%
  filter(Ausgaben<500)%>%
  filter(Geschlecht!="divers")

gf_point(Ausgaben~Geschlecht, data=newselect_age)

#Ausgaben Altersgruppe
favstats(Ausgaben~Geschlecht, data=newselect_age)
##   Geschlecht min  Q1 median  Q3 max     mean       sd   n missing
## 1       Frau   0 200    250 300 470 254.4925 106.4869 469       0
## 2       Mann   0 150    250 300 450 237.8881 110.0730 438       0
###Inferenzstatistik
#lineare Regression
#t-Test
mean(Ausgaben~Geschlecht, data= newselect_age)
##     Frau     Mann 
## 254.4925 237.8881
dEffekt_A1<-diffmean(Ausgaben~Geschlecht, data=newselect_age)
Null_A3<-do(10000)*diffmean(Ausgaben~shuffle(Geschlecht), data=newselect_age)
pVal_A3<-prop(~abs(diffmean)>=abs(dEffekt_A1), data=Null_A3)
gf_point(Ausgaben~Geschlecht, data= newselect_age)

#multible Regression

lRes_S<-lm(Ausgaben~Geschlecht, data=newselect_age)
coef(lRes_S)
##    (Intercept) GeschlechtMann 
##      254.49254      -16.60441
rsquared(lRes_S)
## [1] 0.005855514
lRes_S<-lm(Ausgaben~Geschlecht, data=newselect_age)
summary(lRes_S)
## 
## Call:
## lm(formula = Ausgaben ~ Geschlecht, data = newselect_age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -254.493  -87.888   -4.493   62.112  215.507 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     254.493      4.998  50.921   <2e-16 ***
## GeschlechtMann  -16.604      7.192  -2.309   0.0212 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 108.2 on 905 degrees of freedom
## Multiple R-squared:  0.005856,   Adjusted R-squared:  0.004757 
## F-statistic:  5.33 on 1 and 905 DF,  p-value: 0.02118
dBeta_Geschlecht<-coef(lRes_S)[2]

set.seed(2021)
Bootvtlg<-do(10000)*lm(Ausgaben~Geschlecht,data=resample(newselect_age))
gf_histogram(~GeschlechtMann, data=Bootvtlg)%>%
  gf_vline(xintercept=0)

inspect(Bootvtlg)
## 
## quantitative variables:  
##             name   class           min            Q1        median
## 1      Intercept numeric  2.358985e+02  2.511951e+02  2.544968e+02
## 2 GeschlechtMann numeric -4.129403e+01 -2.149550e+01 -1.655483e+01
## 3          sigma numeric  1.002674e+02  1.067633e+02  1.081066e+02
## 4      r.squared numeric  1.041630e-08  2.894677e-03  5.809503e-03
## 5              F numeric  9.426754e-06  2.627288e+00  5.288322e+00
## 6          numdf numeric  1.000000e+00  1.000000e+00  1.000000e+00
## 7          dendf numeric  9.050000e+02  9.050000e+02  9.050000e+02
## 8           .row integer  1.000000e+00  1.000000e+00  1.000000e+00
## 9         .index numeric  1.000000e+00  2.500750e+03  5.000500e+03
##              Q3          max          mean           sd     n missing
## 1  2.577812e+02 2.717081e+02  2.544799e+02 4.970685e+00 10000       0
## 2 -1.166038e+01 8.663375e+00 -1.658135e+01 7.241891e+00 10000       0
## 3  1.094442e+02 1.162784e+02  1.081037e+02 1.992524e+00 10000       0
## 4  9.792637e-03 3.566888e-02  6.931131e-03 5.265433e-03 10000       0
## 5  8.949981e+00 3.347433e+01  6.342227e+00 4.861508e+00 10000       0
## 6  1.000000e+00 1.000000e+00  1.000000e+00 0.000000e+00 10000       0
## 7  9.050000e+02 9.050000e+02  9.050000e+02 0.000000e+00 10000       0
## 8  1.000000e+00 1.000000e+00  1.000000e+00 0.000000e+00 10000       0
## 9  7.500250e+03 1.000000e+04  5.000500e+03 2.886896e+03 10000       0
qdata(~GeschlechtMann, p=c(0.025,0.975), data=Bootvtlg)
##       2.5%      97.5% 
## -30.673886  -2.385298
#multible Regression

abw.stipro<-diffmean(Ausgaben~Geschlecht, data=newselect_age)

#Simulation der Daten unter der Annahme H0
set.seed(2021)
Nullvtlg_metr<-do(10000)*diffmean(Ausgaben~shuffle(Geschlecht), data=newselect_age)
vQN_m<-qdata(~diffmean, data=Nullvtlg_metr, p=c(0.025,0.975))

gf_histogram(~diffmean, data=Nullvtlg_metr) %>%
  gf_vline(xintercept = ~vQN_m[1]) %>%
  gf_vline(xintercept = ~vQN_m[2]) %>%
  gf_vline(xintercept = ~abw.stipro, color="red")

PWert_m<-prop(~abs(diffmean)>=abs(abw.stipro), data=Nullvtlg_metr)