Sow_Date_Birth
Lots of missing values
Many sows have birth dates from a long time ago, which doesn’t make sense considering the lifespan of a sow.
Looking for some correlations.

Looking at the categorical variables now
Sow_GeneticLine
Sow_RemovalReason
Sow_Date_BreedingHerd_Removed
Lots of weeks with a lot of observations
Some observations are from the 1990s and early 2000s
Let’s look at the observations by year

Sow_Date_Mating_First
One observation from 1961
Sow_GeneticLine

Many outliers.
Some numbers are really big. Let’s do a logarithm transformation.

No outliers can be seen.
Sow_IsExcludedFromAdjustedFarrowingRate

Lots of missing values Most sows are excluded from adjusted farrowing rate.
Sow_Parity_Current.

Sow_Parity_Start

Sow_RemovalReason

Sow_Service_Current

Sow_Tattoo

Sow_Date_BreedingHerd_Available
---
title: "Outliers 2"
output: 
  html_notebook:
    theme: united
    toc: yes
    toc_float: true
---

```{r, echo=FALSE, message=FALSE}
# Install only the packages that are not yet available. Calling
# install.packages() unconditionally re-downloads them on every knit, needs
# network access, and fails when no CRAN mirror is configured.
# "igraph" is installed but not attached (as in the original chunk); "DT" is
# included because the tables below call DT::datatable().
required_pkgs <- c(
  "tidyverse", "plotly", "lubridate", "GGally", "DataExplorer", "igraph", "DT"
)
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

library(tidyverse)
library(plotly)
library(lubridate)
library(GGally)
library(DataExplorer)
```


## Sow_Date_Birth


```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=7.5, fig.height=4.2}
# Weekly number of sows by birth date, shown as an interactive line chart.
# Dates are parsed with ymd() and floored to the start of their week.
dimsow %>%
  count(Sow_Date_Birth, sort = TRUE) %>%
  group_by(week = floor_date(ymd(Sow_Date_Birth), "week")) %>%
  summarise(n = sum(n)) %>%
  plot_ly(x = ~week) %>%
  add_trace(y = ~n, name = "trace 0", mode = "lines") %>%
  layout(title = "Sow Date Birth")
```

```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Number of sows per birth year, shown as an interactive table.
# The year is taken directly from the parsed date as a four-digit string,
# rather than splitting the printed date on "-" and discarding the extra
# pieces (which emitted a warning and mislabelled the month as "week").
x <- dimsow %>%
  count(Sow_Date_Birth, sort = TRUE) %>%
  mutate(year = format(ymd(Sow_Date_Birth), "%Y")) %>%
  group_by(year) %>%
  summarize(Sows = sum(n))

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```

Lots of missing values

Many sows have birth dates from a long time ago, which doesn't make sense considering the lifespan of a sow.

Looking for some correlations.
```{r, echo=FALSE, warning=FALSE}
# Box plots of the service and parity columns, split by whether the birth
# date matches one of the hard-coded years. Dates containing "1951" are
# blanked to NA first, so they fall into neither labelled group.
dimsow %>%
  mutate(
    Sow_Date_Birth = ifelse(
      str_detect(Sow_Date_Birth, "1951"), NA, Sow_Date_Birth
    ),
    outlier = ifelse(
      str_detect(
        Sow_Date_Birth,
        "1952|1955|1959|1965|1971|1973|1976|1983|1984|1985|1988|1989|1990|1991|1992|1993|1994|1995|1996|1997|1998"
      ),
      "Outliers",
      "Not Outlier"
    )
  ) %>%
  select(outlier, Sow_Service_Current, Sow_Parity_Start, Sow_Parity_Current) %>%
  pivot_longer(
    Sow_Service_Current:Sow_Parity_Current,
    names_to = "stat",
    values_to = "value"
  ) %>%
  ggplot(aes(outlier, value, fill = outlier, color = outlier)) +
  geom_boxplot(alpha = 0.4) +
  facet_wrap(~stat, scales = "free_y", nrow = 2) +
  theme_light() +
  labs(y = NULL, color = NULL, fill = NULL)
```

Looking at the categorical variables now

Sow_GeneticLine

```{r, echo=FALSE}
# Per genetic line (lumped to the 70 most frequent levels): number of sows
# whose birth year is / is not in the hard-coded outlier list, and the
# percentage of outliers. "1951-01-01" is treated as missing; rows with a
# missing birth date or genetic line are dropped.
#
# Missing line/outlier combinations are filled with 0. Without the fill a
# line that has only outliers got an NA percentage, which was then reported
# as 0 % instead of 100 %.
x <- dimsow %>%
  mutate(Sow_Date_Birth = na_if(Sow_Date_Birth, "1951-01-01")) %>%
  mutate(
    outlier = ifelse(
      str_detect(
        Sow_Date_Birth,
        "1952|1955|1959|1965|1971|1973|1976|1983|1984|1985|1988|1989|1990|1991|1992|1993|1994|1995|1996|1997|1998"
      ),
      "Outliers",
      "No.Outlier"
    )
  ) %>%
  select(outlier, Sow_GeneticLine) %>%
  na.omit() %>%
  mutate(Sow_GeneticLine = fct_lump(Sow_GeneticLine, n = 70)) %>%
  count(Sow_GeneticLine, outlier, sort = TRUE) %>%
  pivot_wider(
    names_from = outlier,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(
    Percentage.Outliers = round((Outliers * 100) / (No.Outlier + Outliers), 1)
  )

# Centre every column of the four-column table, not only the first two.
DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = "_all"))
  )
)
```



Sow_RemovalReason

```{r, echo=FALSE}
# Per removal reason (lumped to the 70 most frequent levels): number of sows
# whose birth year is / is not in the hard-coded outlier list, and the
# percentage of outliers. "1951-01-01" is treated as missing; rows with a
# missing birth date or removal reason are dropped.
#
# Missing reason/outlier combinations are filled with 0. Without the fill a
# reason that has only outliers got an NA percentage, which was then reported
# as 0 % instead of 100 %.
x <- dimsow %>%
  mutate(Sow_Date_Birth = na_if(Sow_Date_Birth, "1951-01-01")) %>%
  mutate(
    outlier = ifelse(
      str_detect(
        Sow_Date_Birth,
        "1952|1955|1959|1965|1971|1973|1976|1983|1984|1985|1988|1989|1990|1991|1992|1993|1994|1995|1996|1997|1998"
      ),
      "Outliers",
      "No.Outlier"
    )
  ) %>%
  select(outlier, Sow_RemovalReason) %>%
  na.omit() %>%
  mutate(Sow_RemovalReason = fct_lump(Sow_RemovalReason, n = 70)) %>%
  count(Sow_RemovalReason, outlier, sort = TRUE) %>%
  pivot_wider(
    names_from = outlier,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(
    Percentage.Outliers = round((Outliers * 100) / (No.Outlier + Outliers), 1)
  )

# Centre every column of the four-column table, not only the first two.
DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = "_all"))
  )
)
```



## Sow_Date_BreedingHerd_Removed

```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=7.5, fig.height=4.2}
# Weekly number of sows by breeding-herd removal date, shown as an
# interactive line chart.
dimsow %>%
  count(Sow_Date_BreedingHerd_Removed, sort = TRUE) %>%
  group_by(week = floor_date(ymd(Sow_Date_BreedingHerd_Removed), "week")) %>%
  summarise(n = sum(n)) %>%
  plot_ly(x = ~week) %>%
  add_trace(y = ~n, name = "trace 0", mode = "lines") %>%
  layout(title = "Sow_Date_BreedingHerd_Removed")
```


Lots of weeks with a lot of observations

Some observations are from the 1990s and early 2000s

Let's look at the observations by year


```{r, echo=FALSE, warning=FALSE}
# Number of sows per breeding-herd removal year, shown as an interactive
# table. The year is taken directly from the parsed date as a four-digit
# string, rather than splitting the printed date on "-" and discarding the
# extra pieces (which emitted a warning and mislabelled the month as "week").
x <- dimsow %>%
  count(Sow_Date_BreedingHerd_Removed, sort = TRUE) %>%
  mutate(year = format(ymd(Sow_Date_BreedingHerd_Removed), "%Y")) %>%
  group_by(year) %>%
  summarize(Sows = sum(n))

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```


```{r, echo=FALSE, warning=FALSE}
# Box plots of the service and parity columns, split by whether the
# breeding-herd removal date matches one of the years 1990-1998.
dimsow %>%
  mutate(
    outlier = ifelse(
      str_detect(
        Sow_Date_BreedingHerd_Removed,
        "1990|1991|1992|1993|1994|1995|1996|1997|1998"
      ),
      "Outliers",
      "Not Outlier"
    )
  ) %>%
  select(outlier, Sow_Service_Current, Sow_Parity_Start, Sow_Parity_Current) %>%
  pivot_longer(
    Sow_Service_Current:Sow_Parity_Current,
    names_to = "stat",
    values_to = "value"
  ) %>%
  ggplot(aes(outlier, value, fill = outlier, color = outlier)) +
  geom_boxplot(alpha = 0.4) +
  facet_wrap(~stat, scales = "free_y", nrow = 2) +
  theme_light() +
  labs(y = NULL, color = NULL, fill = NULL)
```


```{r, echo=FALSE}
# Per genetic line (lumped to the 70 most frequent levels): number of sows
# whose breeding-herd removal date is / is not in 1990-1998, and the
# percentage of outliers. Rows with a missing removal date or genetic line
# are dropped.
#
# Missing line/outlier combinations are filled with 0. Without the fill a
# line that has only outliers got an NA percentage, which was then reported
# as 0 % instead of 100 %.
x <- dimsow %>%
  mutate(
    outlier = ifelse(
      str_detect(
        Sow_Date_BreedingHerd_Removed,
        "1990|1991|1992|1993|1994|1995|1996|1997|1998"
      ),
      "Outliers",
      "No.Outlier"
    )
  ) %>%
  select(outlier, Sow_GeneticLine) %>%
  na.omit() %>%
  mutate(Sow_GeneticLine = fct_lump(Sow_GeneticLine, n = 70)) %>%
  count(Sow_GeneticLine, outlier, sort = TRUE) %>%
  pivot_wider(
    names_from = outlier,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(
    Percentage.Outliers = round((Outliers * 100) / (No.Outlier + Outliers), 1)
  )

# Centre every column of the four-column table, not only the first two.
DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = "_all"))
  )
)
```



```{r, echo=FALSE}
# Per removal reason (lumped to the 70 most frequent levels): number of sows
# whose breeding-herd removal date is / is not in 1990-1998, and the
# percentage of outliers. Rows with a missing removal date or removal reason
# are dropped.
#
# Missing reason/outlier combinations are filled with 0. Without the fill a
# reason that has only outliers got an NA percentage, which was then reported
# as 0 % instead of 100 %.
x <- dimsow %>%
  mutate(
    outlier = ifelse(
      str_detect(
        Sow_Date_BreedingHerd_Removed,
        "1990|1991|1992|1993|1994|1995|1996|1997|1998"
      ),
      "Outliers",
      "No.Outlier"
    )
  ) %>%
  select(outlier, Sow_RemovalReason) %>%
  na.omit() %>%
  mutate(Sow_RemovalReason = fct_lump(Sow_RemovalReason, n = 70)) %>%
  count(Sow_RemovalReason, outlier, sort = TRUE) %>%
  pivot_wider(
    names_from = outlier,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(
    Percentage.Outliers = round((Outliers * 100) / (No.Outlier + Outliers), 1)
  )

# Centre every column of the four-column table, not only the first two.
DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = "_all"))
  )
)
```


## Sow_Date_Mating_First                      


```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=7.5, fig.height=4.2}
# Weekly number of sows by first mating date, shown as an interactive
# line chart.
dimsow %>%
  count(Sow_Date_Mating_First, sort = TRUE) %>%
  group_by(week = floor_date(ymd(Sow_Date_Mating_First), "week")) %>%
  summarise(n = sum(n)) %>%
  plot_ly(x = ~week) %>%
  add_trace(y = ~n, name = "trace 0", mode = "lines") %>%
  layout(title = "Sow_Date_Mating_First ")
```

One observation from 1961


```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Number of sows per first-mating year, shown as an interactive table.
# The chunk option was misspelled "nessage", so messages were not suppressed.
# The year is taken directly from the parsed date as a four-digit string,
# rather than splitting the printed date on "-" and discarding the extra
# pieces (which emitted a warning and mislabelled the month as "week").
x <- dimsow %>%
  count(Sow_Date_Mating_First, sort = TRUE) %>%
  mutate(year = format(ymd(Sow_Date_Mating_First), "%Y")) %>%
  group_by(year) %>%
  summarize(Sows = sum(n))

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```



## Sow_GeneticLine 

```{r, echo=FALSE}
# Observations per genetic line, most frequent first, as an interactive table.
x <- dimsow %>%
  count(Sow_GeneticLine, sort = TRUE) %>%
  rename(Observations = n)

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```



```{r, echo=FALSE}
# Horizontal box plot of the number of sows per genetic line.
# The vertical axis carries no information, so its title, labels and ticks
# are hidden.
ggplot(count(dimsow, Sow_GeneticLine, sort = TRUE), aes(y = (n))) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Number of Sows by Genetic Line") +
  theme_light() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
```

Many outliers.

Some numbers are really big. Let's do a logarithm transformation.


```{r, echo=FALSE}
# Same box plot as before, but on the natural log of the per-line counts.
ggplot(count(dimsow, Sow_GeneticLine, sort = TRUE), aes(y = log(n))) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Number of Sows by Genetic Line") +
  theme_light() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
```

No outliers can be seen.


## Sow_HerdCategory

```{r, echo=FALSE}
# Observations per herd category, as an interactive table.
x <- count(dimsow, Sow_HerdCategory)

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```


## Sow_IsExcludedFromAdjustedFarrowingRate 


```{r, echo=FALSE, warning=FALSE, message=FALSE}
# Bar chart of row counts per value of the exclusion flag; the count axis
# uses comma-separated labels.
dimsow %>%
  ggplot(aes(x = as.factor(Sow_IsExcludedFromAdjustedFarrowingRate))) +
  geom_bar() +
  scale_y_continuous(labels = scales::comma) +
  theme_light() +
  labs(x = "Sow_IsExcludedFromAdjustedFarrowingRate")
```

Lots of missing values.

Most sows are excluded from adjusted farrowing rate.

## Sow_Parity_Current


```{r, echo=FALSE}
# Horizontal box plot of the current parity of each sow.
ggplot(dimsow, aes(y = (Sow_Parity_Current))) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Sow Parity Current") +
  theme_light() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
```


## Sow_Parity_Start 

```{r, echo=FALSE}
# Horizontal box plot of the starting parity of each sow.
ggplot(dimsow, aes(y = Sow_Parity_Start)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Sow Parity Start") +
  theme_light() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
```

## Sow_RemovalReason 

```{r, echo=FALSE}
# Observations per removal reason, most frequent first, as an interactive
# table.
x <- count(dimsow, Sow_RemovalReason, sort = TRUE)

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```


```{r, echo=FALSE}
# Horizontal box plot of the number of sows per removal reason.
ggplot(count(dimsow, Sow_RemovalReason, sort = TRUE), aes(y = (n))) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Number of Sows by Removal Reason") +
  theme_light() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
```


## Sow_Service_Current 

```{r, echo=FALSE}
# Observations per value of Sow_Service_Current, as an interactive table.
x <- count(dimsow, Sow_Service_Current)

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```

```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Horizontal box plot of Sow_Service_Current across all sows.
ggplot(dimsow, aes(y = (Sow_Service_Current))) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Sow Service Current") +
  theme_light() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
```

## Sow_Status   

```{r, echo=FALSE}
# Observations per sow status, most frequent first, as an interactive table.
x <- count(dimsow, Sow_Status, sort = TRUE)

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```

## Sow_Tattoo

```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Observations per tattoo value, most frequent first, as an interactive table.
x <- count(dimsow, Sow_Tattoo, sort = TRUE)

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```



```{r, echo=FALSE}
# Horizontal box plot of how many sows share each tattoo value; rows with a
# missing tattoo are excluded before counting.
dimsow %>%
  filter(!is.na(Sow_Tattoo)) %>%
  count(Sow_Tattoo, sort = TRUE) %>%
  ggplot(aes(y = (n))) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Sow_Tattoo") +
  theme_light() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
```

## Sow_Date_LastModified 

```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=7.5, fig.height=4.2}
# Weekly number of rows by last-modified date, shown as an interactive
# line chart.
dimsow %>%
  group_by(week = floor_date(ymd(Sow_Date_LastModified), "week")) %>%
  summarise(n = n()) %>%
  plot_ly(x = ~week) %>%
  add_trace(y = ~n, name = "trace 0", mode = "lines") %>%
  layout(title = "Sow_Date_LastModified")
```

## Sow_Date_BreedingHerd_Available

```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=7.5, fig.height=4.2}
# Weekly number of rows by breeding-herd availability date, shown as an
# interactive line chart.
dimsow %>%
  group_by(week = floor_date(ymd(Sow_Date_BreedingHerd_Available), "week")) %>%
  summarise(n = n()) %>%
  plot_ly(x = ~week) %>%
  add_trace(y = ~n, name = "trace 0", mode = "lines") %>%
  layout(title = "Sow_Date_BreedingHerd_Available")
```

## Sow_Date_Identity

```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=7.5, fig.height=4.2}
# Weekly number of rows by identity date, shown as an interactive line chart.
dimsow %>%
  group_by(week = floor_date(ymd(Sow_Date_Identity), "week")) %>%
  summarise(n = n()) %>%
  plot_ly(x = ~week) %>%
  add_trace(y = ~n, name = "trace 0", mode = "lines") %>%
  layout(title = "Sow_Date_Identity")
```

## Sow_IsActive

```{r, echo=FALSE}
# Observations per value of Sow_IsActive, as an interactive table.
x <- count(dimsow, Sow_IsActive)

DT::datatable(
  x,
  rownames = FALSE,
  options = list(
    columnDefs = list(list(className = "dt-center", targets = 0:1))
  )
)
```





















