Homework No.4

Describe Your Data:

  1. 4 erroneous visualizations and correct versions.
  2. 4 lying/misleading visualizations and correct versions.

Visualizations

The graphs are presented in the following order:
the first graph of each pair is the incorrect version, and the second is the corrected version.

1.1 Junk chart: no legend, no axes, no title, hardly readable values

First claim or repeated claim?

 # Deliberately bare "junk chart": no title, a blank y-axis label, and white
 # count labels drawn at the bar tops with no vertical offset.
 ggplot(
   data = insurance_data,
   mapping = aes(x = factor(REPEAT5), fill = REPEAT5)
 ) +
   geom_bar() +
   geom_text(
     mapping = aes(label = ..count..),
     stat = "count",
     colour = "white"
   ) +
   labs(y = " ") +
   theme_light()

 # Corrected chart: number of rows per REPEAT5 value (0 = first claim,
 # 1 = repeated claim), with a title, readable count labels and a legend.
 #
 # REPEAT5 is mapped as a factor for BOTH x and fill: mapping the raw numeric
 # column to fill yields a continuous colour bar, which is meaningless for a
 # binary category. Tick labels come from the scale instead of a space-padded
 # axis title, so they stay aligned with the bars at any plot width.
 # NOTE(review): `..count..` is deprecated in recent ggplot2 releases in favour
 # of after_stat(count); kept here so the snippet also runs on older versions.
 ggplot(
   insurance_data,
   aes(x = factor(REPEAT5), fill = factor(REPEAT5))
 ) +
   geom_bar() +
   geom_text(
     aes(label = ..count..),
     stat = "count",
     vjust = 1.5,        # push the label just below the top edge of each bar
     colour = "white"
   ) +
   scale_x_discrete(
     labels = c("0" = "First claim", "1" = "Repeated claim")
   ) +
   scale_fill_discrete(
     name = "Claim type",
     labels = c("0" = "First claim", "1" = "Repeated claim")
   ) +
   labs(
     title = "First or repeated claim",
     x = "Claim type",
     y = "Count"
   )


Komentaras

1.2 Unknown values presented

Percentage of customers who had car accidents from urban or rural areas

# "Unknown values" example: a pie of URBANICITY counts drawn without value
# labels and without a legend.
# Build one row per URBANICITY level with its count (Freq), percentage share
# (prop), label position on the count scale (ypos) and percentage text (proc).
data2 <- as.data.frame(table(insurance_data$URBANICITY)) %>%
  arrange(desc(Var1)) %>%
  mutate(
    prop = Freq / sum(Freq) * 100,
    ypos = cumsum(Freq) - 0.5 * Freq,
    proc = paste0(round(prop, 1), "%")
  )

ggplot(data = data2, mapping = aes(x = "", y = Freq, fill = Var1)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme_void() +
  # geom_text(aes(y = ypos, label = Freq), color = "white", size=6) +
  scale_fill_brewer(name = " ", palette = "Dark2") +
  labs(title = "Percentage of CUSTUMERS who did  car accidents form Urban or Rural areas") +
  # must come after theme_void(), which would otherwise reset the legend setting
  theme(legend.position = "none")

# Corrected pie chart: percentage of rows per URBANICITY level, with a
# percentage label inside every slice and a named legend.
#
# data1 columns: Var1 (URBANICITY level), Freq (row count), prop (share in %),
# ypos (mid-point of the slice on the cumulative percentage scale, used to
# centre the label) and proc (label text such as "79.5%").
data1 <- as.data.frame(table(insurance_data$URBANICITY)) %>%
  # descending level order so cumsum() below follows the stacking order
  arrange(desc(Var1)) %>%
  mutate(
    prop = Freq / sum(Freq) * 100,   # piped column, not the outer data1$Freq
    ypos = cumsum(prop) - 0.5 * prop,
    proc = paste0(round(prop, 1), "%")
  )

ggplot(data1, aes(x = "", y = prop, fill = Var1)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  geom_text(aes(y = ypos, label = proc), color = "white", size = 6) +
  scale_fill_brewer(palette = "Dark2", name = "URBANICITY") +
  labs(title = "Percentage of customers who had car accidents from urban or rural areas")


Komentaras

1.3 Lack of contrast

Percentage of customers who had car accidents from urban or rural areas

# Summarise URBANICITY into one row per level: count (Freq), percentage share
# (prop), label mid-point on the cumulative percentage scale (ypos) and the
# formatted percentage text (proc).
data1 <- as.data.frame(table(insurance_data$URBANICITY)) %>%
  arrange(desc(Var1)) %>%
  mutate(
    prop = Freq / sum(Freq) * 100,
    ypos = cumsum(prop) - 0.5 * prop,
    proc = paste0(round(prop, 1), "%")
  )



# "Lack of contrast" example: white percentage labels on light pastel slices.
#
# "Light2" is not a ColorBrewer palette, so the original call only produced an
# "Unknown palette" warning and a silent fallback. "Pastel2" is a valid light
# qualitative palette that gives the intended low-contrast effect on purpose.
ggplot(data1, aes(x = "", y = prop, fill = Var1)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  geom_text(aes(y = ypos, label = proc), color = "white", size = 6) +
  scale_fill_brewer(palette = "Pastel2", name = "URBANICITY") +
  labs(title = "Percentage of customers who had car accidents from urban or rural areas")

# Corrected version: the dark "Dark2" palette gives the white percentage
# labels enough contrast to be readable. Title spelling fixed.
ggplot(data1, aes(x = "", y = prop, fill = Var1)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  # ypos / proc are the precomputed label position and text columns of data1
  geom_text(aes(y = ypos, label = proc), color = "white", size = 6) +
  scale_fill_brewer(palette = "Dark2", name = "URBANICITY") +
  labs(title = "Percentage of customers who had car accidents from urban or rural areas")

1.4 Wrong graph used

Number of drivers by different groups of vehicles

# "Wrong graph" example: hard-coded per-category totals joined by a red line,
# even though the x axis is categorical.
df <- data.frame(
  dose = c("Miniva", "Panel Truck", "Pickup", "Sports car", "SUV", "van"),
  len = c(700, 270, 600, 400, 920, 300)
)

# group = 1 puts all categories in one group so geom_line() can connect them
ggplot(data = df, mapping = aes(x = dose, y = len, group = 1)) +
  geom_line(colour = "red") +
  geom_point()

# Corrected chart: number of rows (drivers) per CAR_TYPE as a bar chart.
#
# The previous version mapped y = REPEAT5 with stat = "identity", which stacks
# the REPEAT5 values and therefore shows their SUM per car type, not the
# number of drivers promised by the title and the "Count" axis.
# geom_bar() with its default stat counts the rows in each category.
ggplot(insurance_data, aes(x = CAR_TYPE, fill = CAR_TYPE)) +
  geom_bar() +
  labs(
    title = "Number of drivers by different groups of vehicles",
    x = "Car type",
    y = "Count"
  ) +
  theme_light()


Komentaras

Misleading visualizations

2.1. Wrong proportions

Percentage of customers who had car accidents from urban or rural areas

# Per-level URBANICITY summary used by the two pies below: Freq is the row
# count, prop the share in percent, ypos the centre of each slice on the
# cumulative percentage scale, proc the label text.
data1 <- as.data.frame(table(insurance_data$URBANICITY)) %>%
  arrange(desc(Var1)) %>%
  mutate(
    prop = Freq / sum(Freq) * 100,
    ypos = cumsum(prop) - 0.5 * prop,
    proc = paste0(round(prop, 1), "%")
  )



# "Wrong proportions" example: slice sizes are hard-coded to 75 / 25 while the
# labels (proc) and their positions (ypos) still come from the real shares.
ggplot(data = data1, mapping = aes(x = "", y = c(75, 25), fill = Var1)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  theme_void() +
  geom_text(
    mapping = aes(y = ypos, label = proc),
    color = "white",
    size = 6
  ) +
  scale_fill_brewer(name = "URBANICITY", palette = "Dark2") +
  labs(title = "Percentage of CUSTUMERS who did  car accidents form Urban or Rural areas")

# Corrected version: slice sizes come from the computed shares (prop), so they
# agree with the percentage labels. Title spelling fixed.
ggplot(data1, aes(x = "", y = prop, fill = Var1)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  # ypos / proc are the precomputed label position and text columns of data1
  geom_text(aes(y = ypos, label = proc), color = "white", size = 6) +
  scale_fill_brewer(palette = "Dark2", name = "URBANICITY") +
  labs(title = "Percentage of customers who had car accidents from urban or rural areas")


Komentaras

2.2. changing the y-axis

Number of drivers by different groups of Education

# "Changing the y-axis" example: the visible y range is cut to 300-1200, so
# bar heights no longer start at zero and differences look exaggerated.
# NOTE(review): y = REPEAT5 drawn as stacked columns shows the sum of REPEAT5
# per education level rather than a row count - confirm this matches the
# "Number of drivers" / "Count" labels.
ggplot(
  data = insurance_data,
  mapping = aes(
    # order education levels by descending number of rows
    x = reorder(EDUCATION, -table(EDUCATION)[EDUCATION]),
    y = REPEAT5,
    fill = EDUCATION
  )
) +
  geom_col() +
  coord_cartesian(ylim = c(300, 1200)) +
  labs(
    title = "Number of drivers by different groups of Education",
    x = "Education level",
    y = "Count"
  ) +
  theme_light()

# Corrected chart: number of rows (drivers) per EDUCATION level with the
# y axis starting at zero.
#
# The previous version mapped y = REPEAT5 with stat = "identity", i.e. it drew
# the SUM of REPEAT5 per level, while the title, the "Count" axis and the bar
# ordering (by number of rows) all refer to row counts. geom_bar() with its
# default stat counts rows, so heights and ordering now agree.
ggplot(
  insurance_data,
  aes(
    # order education levels by descending number of rows
    x = reorder(EDUCATION, -table(EDUCATION)[EDUCATION]),
    fill = EDUCATION
  )
) +
  geom_bar() +
  labs(
    title = "Number of drivers by different groups of education",
    x = "Education level",
    y = "Count"
  ) +
  theme_light()


Komentaras

2.3. Wrong use of logarithmic scale

Relation between home value and yearly income

# "Wrong use of logarithmic scale" example: HOME_VAL is put on a log2 x axis
# while INCOME stays linear.
ggplot(data = insurance_data, mapping = aes(x = HOME_VAL, y = INCOME)) +
  geom_point() +
  scale_x_continuous(trans = "log2") +
  labs(
    title = "Relation between Home value, Year Income",
    x = "Home value, $",
    y = "Yeary Income, $"
  ) +
  theme_light()
## Warning: Transformation introduced infinite values in continuous x-axis

# Corrected version: both HOME_VAL and INCOME on linear axes.
# Axis label typo ("Yeary") and title wording fixed.
ggplot(insurance_data, aes(x = HOME_VAL, y = INCOME)) +
  geom_point() +
  labs(
    title = "Relation between home value and yearly income",
    x = "Home value, $",
    y = "Yearly income, $"
  ) +
  theme_light()


Komentaras

2.4. Not informative histogram

Distribution of drivers' years on current job

# "Not informative" example: only 3 bins, which hides the shape of the YOJ
# distribution. The bin count is deliberately left at 3.
# The x label previously read "Age, years" although the plotted variable is
# YOJ (years on current job, per the title); the label now matches the data.
ggplot(insurance_data, aes(x = YOJ)) +
  geom_histogram(color = "darkblue", fill = "lightblue", bins = 3) +
  labs(
    title = "Distribution of drivers' years on current job",
    x = "Years on current job",
    y = "Count"
  ) +
  theme_light()

# Corrected version: 24 bins show the shape of the YOJ distribution.
# The x label previously read "Age, years" although the plotted variable is
# YOJ (years on current job, per the title); the label now matches the data.
ggplot(insurance_data, aes(x = YOJ)) +
  geom_histogram(color = "darkblue", fill = "lightblue", bins = 24) +
  labs(
    title = "Distribution of drivers' years on current job",
    x = "Years on current job",
    y = "Count"
  ) +
  theme_light()


Komentaras