Exercício 12 [Visualização de Dados] by Arthur Macedo

Questões

Questão 1

# Data supplied by the exercise: one MRT vector per scenario, with one value
# for each entry of `clock`.
MRT_1F <- c(
  517.1468515630205, 85.13094142168089, 30.333207896694553,
  12.694776264558937, 3.3041601673945418, 1.1823111717498882,
  1.1892293502386786
)
MRT_3F <- c(
  156.68929936163462, 11.540837783562276, 0.4512835621696538,
  0.4509797929766453, 0.4502068233039181, 0.4496185276300172,
  0.4543157082191288
)
MRT_5F <- c(
  83.90319666471157, 0.3068151086494968, 0.30522314133037304,
  0.3072588968084928, 0.30655265997285697, 0.3055812715727718,
  0.3053297166713006
)
MRT_10F <- c(
  29.55430642951759, 0.19832832665772515, 0.1971923924717474,
  0.19796648905716516, 0.19615594370806338, 0.2034569237883263,
  0.19617420889447737
)
MRT_15F <- c(
  11.317736530583566, 0.167364215666193, 0.16172168266811013,
  0.16701085329580515, 0.1598052657153692, 0.1645934043532696,
  0.16216563797118075
)
MRT_sem_F <- c(
  11.93430909937736, 0.6095414637034009, 0.6060645101029295,
  0.612167181646899, 0.6146761002685637, 0.6096747087200697,
  0.6125810476877268
)
clock <- c(0.1, 0.5, 1, 1.5, 2, 2.5, 3)

# Chart 1: one line per scenario.
mrt_series <- list(
  MRT_1F = MRT_1F, MRT_3F = MRT_3F, MRT_5F = MRT_5F,
  MRT_10F = MRT_10F, MRT_15F = MRT_15F, MRT_sem_F = MRT_sem_F
)
mrt_colors <- c("red", "blue", "green", "purple", "orange", "brown")

# Derive the y range from every series, not only the first one plotted, so
# no curve can fall outside the axes if the data change.
plot(clock, mrt_series[[1]], type = "o", col = mrt_colors[1],
     ylim = range(unlist(mrt_series)),
     xlab = "Clock (GHz)", ylab = "MRT", main = "MRT vs Clock")
for (i in seq_along(mrt_series)[-1]) {
  lines(clock, mrt_series[[i]], type = "o", col = mrt_colors[i])
}

# pch = 1 makes the legend keys show the same line-plus-point glyph that
# type = "o" draws.
legend("topright", legend = names(mrt_series), col = mrt_colors,
       lty = 1, pch = 1, cex = 0.8)

# Chart 2: one bar-chart panel per fog scenario, each compared against the
# no-fog series (MRT_sem_F) on a log-scaled y axis.

# par() returns the previous settings; they are restored after the loop so
# the 3x2 grid does not leak into later base-graphics plots.
previous_par <- par(mfrow = c(3, 2))

fog_scenarios <- list(
  MRT_1F = MRT_1F, MRT_3F = MRT_3F, MRT_5F = MRT_5F,
  MRT_10F = MRT_10F, MRT_15F = MRT_15F
)
bar_fill <- c("#E6E6E6", "#666666")

for (scenario in names(fog_scenarios)) {
  # Row 1 = fog scenario, row 2 = no fog; beside = TRUE pairs them per clock.
  barplot(height = rbind(fog_scenarios[[scenario]], MRT_sem_F), beside = TRUE,
          col = bar_fill, log = "y", names.arg = clock,
          main = paste(scenario, "vs Sem Fog"),
          xlab = "Clock (GHz)", ylab = "MRT")
  legend("topright", legend = c(scenario, "Sem Fog"),
         fill = bar_fill, cex = 0.8)
}

par(previous_par)

Questão 2

# Build the table: one row per quality rating, one column per price range.
# check.names = FALSE keeps the column names exactly as "10-19", "20-29", ...;
# by default data.frame() rewrites them to syntactic names such as "X10.19",
# which would then appear on the x axis.
rating_levels <- c("Good", "Very Good", "Excellent")
data <- data.frame(
  # Explicit factor levels keep the ratings in their natural order instead of
  # the alphabetical order a character column would get in the plot.
  Quality_Rating = factor(rating_levels, levels = rating_levels),
  `10-19` = c(53.8, 43.6, 2.6),
  `20-29` = c(33.9, 54.2, 11.9),
  `30-39` = c(2.6, 60.5, 36.8),
  `40-49` = c(0.5, 21.4, 78.6),
  check.names = FALSE
)
# NOTE(review): the "40-49" column sums to 100.5 - confirm the 0.5 entry
# against the original table.

# Reshape to long format (one row per rating/price-range pair), which is what
# the stacked bar chart needs.
data_long <- melt(data, id.vars = "Quality_Rating")

# Stacked bar chart: one bar per price range, segments by quality rating.
ggplot(data_long, aes(x = variable, y = value, fill = Quality_Rating)) +
  geom_bar(stat = "identity") +
  labs(title = "Qualidade da Refeição por Faixa de Preço",
       x = "Faixa de Preço ($)",
       y = "Percentual",
       fill = "Classificação de Qualidade") +
  theme_minimal()

Questão 3

# Load the airquality dataset.
data("airquality")

# Keep only the temperatures recorded in May (Month == 5).
may_temps <- airquality$Temp[airquality$Month == 5]

# Convert from Fahrenheit to Celsius.
may_temps_celsius <- (may_temps - 32) / 1.8

# Histogram on the density scale with a density curve on top.
# after_stat(density) and linewidth replace the `..density..` notation and the
# `size` argument for lines, both deprecated since ggplot2 3.4.0.
ggplot(data.frame(TempC = may_temps_celsius), aes(x = TempC)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 1,
                 fill = "skyblue", color = "black") +
  geom_density(color = "red", linewidth = 1) +
  labs(title = "Histograma das Temperaturas em Maio (Celsius)",
       x = "Temperatura (°C)",
       y = "Densidade") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Questão 4

# Sales per country, entered by hand from the figures given in the exercise.
sales_by_country <- c(
  US = 340, UK = 290, France = 510,
  Poland = 820, Japan = 120, China = 780
)
sales <- data.frame(
  COUNTRY = names(sales_by_country),
  SALES = unname(sales_by_country)
)

# Show the table to check the values.
print(sales)

##   COUNTRY SALES
## 1      US   340
## 2      UK   290
## 3  France   510
## 4  Poland   820
## 5   Japan   120
## 6   China   780

# Share of total sales per country, as a percentage rounded to one decimal.
sales_percent <- with(sales, round(100 * SALES / sum(SALES), 1))

# Abort unless every sales value is strictly positive.
if (any(sales$SALES <= 0)) {
  stop(
    "Existem valores negativos ou zero em sales$SALES. Corrija os dados antes de prosseguir."
  )
}

# One fill colour per slice, in the same order as the rows of `sales`.
colors <- c("skyblue", "orange", "green", "red", "purple", "yellow")

# Pie chart; each slice is labelled with the country and its percentage.
slice_labels <- paste(sales$COUNTRY, sales_percent, "%")
pie(
  sales$SALES,
  labels = slice_labels,
  col = colors,
  main = "Porcentagem de Vendas por País"
)

# Legend mapping colours to countries.
legend("topright", legend = sales$COUNTRY, fill = colors)

Questão 5

# Load the InsectSprays dataset.
data("InsectSprays")

# One yellow-filled box per spray type; outlier.shape = NA hides the outlier
# points.
ggplot(data = InsectSprays, mapping = aes(x = spray, y = count)) +
  geom_boxplot(fill = "yellow", outlier.shape = NA) +
  theme_minimal() +
  labs(
    title = "Contagem de Insetos por Tipo de Inseticida",
    x = "Tipo de Inseticida",
    y = "Contagem de Insetos"
  )

Questão 6

# Read the four monitoring CSV files (file-name suffixes: 0.1, 0.5, 1, NONE)
# from the working directory.
df1 <- read.csv(file = "monitoringCloudData_0.1.csv")
df2 <- read.csv(file = "monitoringCloudData_0.5.csv")
df3 <- read.csv(file = "monitoringCloudData_1.csv")
df4 <- read.csv(file = "monitoringCloudData_NONE.csv")

# Convert memory sizes written as text (e.g. "1.5GB", "512MB") to megabytes.
#
# Vectorised: accepts a vector of any length, including length zero, and
# returns a numeric vector of the same length.
#
# Args:
#   memory_str: values holding a number optionally followed by one of the
#     unit markers "GB", "MB", "KB" or "TB". Values without a marker are
#     taken to be in MB already.
#
# Returns:
#   Numeric vector of sizes in MB. Entries whose numeric part cannot be
#   parsed become NA (as.numeric() emits its usual coercion warning).
convert_to_mb <- function(memory_str) {
  text <- as.character(memory_str)

  # Factor that turns each unit into MB. The order is the order in which the
  # markers are tested; the first marker found in a value wins.
  to_mb <- c(GB = 1024, MB = 1, KB = 1 / 1024, TB = 1024 * 1024)

  number <- text
  multiplier <- rep(1, length(text))
  resolved <- rep(FALSE, length(text))

  for (unit in names(to_mb)) {
    hit <- !resolved & grepl(unit, text, fixed = TRUE)
    # Strip the marker so that only the numeric part is left to parse.
    number[hit] <- sub(unit, "", text[hit], fixed = TRUE)
    multiplier[hit] <- to_mb[[unit]]
    resolved <- resolved | hit
  }

  as.numeric(number) * multiplier
}

# Prepare one monitoring data frame for plotting:
#   - usedMemory is converted to MB,
#   - currentTime is parsed as POSIXct in UTC,
#   - time_diff holds the hours elapsed since the earliest currentTime.
prepare_monitoring <- function(df) {
  # vapply() always returns a plain, unnamed numeric vector (also for zero
  # rows), whereas sapply() attaches the input strings as names and changes
  # its return type on empty input.
  df$usedMemory <- vapply(df$usedMemory, convert_to_mb, numeric(1),
                          USE.NAMES = FALSE)
  df$currentTime <- as.POSIXct(df$currentTime, tz = "UTC")
  df$time_diff <- as.numeric(
    difftime(df$currentTime, min(df$currentTime), units = "hours")
  )
  df
}

df1 <- prepare_monitoring(df1)
df2 <- prepare_monitoring(df2)
df3 <- prepare_monitoring(df3)
df4 <- prepare_monitoring(df4)

# Draw used memory against elapsed time for one data frame.
plot_used_memory <- function(df, line_color, label) {
  plot(df$time_diff, df$usedMemory, type = "l", col = line_color,
       xlab = "Tempo (horas)", ylab = "Memória Usada (MB)",
       main = paste("Memória Usada -", label))
}

# Draw the four charts in a 2x2 grid.
par(mfrow = c(2, 2))

plot_used_memory(df1, "blue", "0.1")
plot_used_memory(df2, "red", "0.5")
plot_used_memory(df3, "green", "1")
plot_used_memory(df4, "purple", "NONE")

# Back to a single-panel layout.
par(mfrow = c(1, 1))

Questão 7

# Load the dataset.
netflix_data <- read.csv("netflix_titles.csv")

# Keep only titles with exactly one country of origin. Rows whose country is
# missing or blank are dropped too: they contain no comma, so a comma test
# alone would keep them and count "" as a country.
country_value <- netflix_data$country
has_single_country <- !is.na(country_value) &
  nzchar(trimws(country_value)) &
  !grepl(",", country_value)
single_country <- netflix_data[has_single_country, ]

# Number of titles per country.
country_count <- table(single_country$country)

# The 10 countries with the most titles, in decreasing order.
top_10_countries <- head(sort(country_count, decreasing = TRUE), 10)

# Plain data frame (country, count) for use with plotly.
top_10_df <- data.frame(
  country = names(top_10_countries),
  count = as.numeric(top_10_countries)
)

# Gerar o gráfico de pizza com Plotly
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Pie chart of the top-10 table: one slice per country, labelled with the
# country name and its percentage, text oriented radially.
top_10_df %>%
  plot_ly(
    labels = ~country,
    values = ~count,
    type = "pie",
    textinfo = "label+percent",
    insidetextorientation = "radial"
  ) %>%
  layout(title = "Top 10 Países com Mais Conteúdo na Netflix")

Questão 8

# Gerar a tabela com Plotly
library(plotly)

# Plotly table of the top-10 countries.
# Header row: white text on a grey fill, centred.
table_header <- list(
  values = c("País", "Total de Conteúdos"),
  fill = list(color = "grey"),
  font = list(color = "white"),
  align = "center"
)

# Body: one column with the countries and one with their counts, centred.
table_cells <- list(
  values = list(top_10_df$country, top_10_df$count),
  align = "center"
)

plot_ly(type = "table", header = table_header, cells = table_cells) %>%
  layout(title = "Top 10 Países com Mais Conteúdo na Netflix")

Questão 9

# Add a decade column: release_year rounded down to a multiple of 10.
netflix_data$decade <- floor(netflix_data$release_year / 10) * 10

# Count the rows of `titles` per decade and return a data frame with the
# columns "decade" and "count".
count_titles_by_decade <- function(titles) {
  tally <- aggregate(titles$title, by = list(titles$decade), FUN = length)
  names(tally) <- c("decade", "count")
  tally
}

# Split by content type.
movies <- netflix_data[netflix_data$type == "Movie", ]
tv_shows <- netflix_data[netflix_data$type == "TV Show", ]

# Titles per decade for each content type.
movies_count <- count_titles_by_decade(movies)
tv_shows_count <- count_titles_by_decade(tv_shows)

# Gerar o gráfico com Plotly
library(plotly)

# Titles per decade: one lines+markers trace for movies and one for TV shows.
# add_trace() is used instead of add_lines() because add_lines() sets the
# trace type and mode itself, which conflicts with the explicit
# type = "scatter" / mode = "lines+markers" requested here.
# The marker colour is set explicitly so the points match their line.
plot_ly() %>%
  add_trace(x = movies_count$decade, y = movies_count$count,
            type = "scatter", mode = "lines+markers", name = "Filmes",
            line = list(color = "yellow"),
            marker = list(color = "yellow")) %>%
  add_trace(x = tv_shows_count$decade, y = tv_shows_count$count,
            type = "scatter", mode = "lines+markers", name = "Séries",
            line = list(color = "blue"),
            marker = list(color = "blue")) %>%
  layout(title = "Quantidade de Conteúdo por Década na Netflix",
         xaxis = list(title = "Década"),
         yaxis = list(title = "Quantidade de Conteúdo"))

Questão 10

# Keep only movies released from 2000 to 2010 (inclusive).
filtered_data <- subset(
  netflix_data,
  type == "Movie" & release_year >= 2000 & release_year <= 2010
)

# Use only the first genre listed for each title. vapply() guarantees a
# character vector even when no rows are left; sapply() would return an empty
# list in that case.
genre_lists <- strsplit(filtered_data$listed_in, ",")
filtered_data$primary_genre <- vapply(genre_lists, `[`, character(1), 1)

# Keep only the genres of interest.
genres_of_interest <- c("Dramas", "Action & Adventure", "Comedies")
filtered_data <- subset(filtered_data, primary_genre %in% genres_of_interest)

# Contar a quantidade de filmes por ano e gênero
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Number of movies per release year and primary genre.
# .groups = "drop" returns an ungrouped result directly, which also avoids
# the "summarise() has grouped output" message and the separate ungroup().
genre_year_count <- filtered_data %>%
  group_by(release_year, primary_genre) %>%
  summarise(count = n(), .groups = "drop")

## `summarise()` has grouped output by 'release_year'. You can override using the
## `.groups` argument.

# Criar o gráfico de barras empilhadas com Plotly
library(plotly)

# Bar chart of movie counts per release year, one colour per primary genre.
# NOTE(review): barmode = "group" draws the genres side by side, while the
# comment preceding this chart describes stacked bars - confirm which is
# wanted (use barmode = "stack" for stacked bars).
genre_colors <- c(
  "Dramas" = "blue",
  "Action & Adventure" = "orange",
  "Comedies" = "green"
)

genre_year_count %>%
  plot_ly(
    x = ~release_year,
    y = ~count,
    color = ~primary_genre,
    type = "bar",
    colors = genre_colors
  ) %>%
  layout(
    title = "Quantidade de Filmes por Gênero (2000-2010)",
    xaxis = list(title = "Ano de Lançamento"),
    yaxis = list(title = "Quantidade de Filmes"),
    barmode = "group"
  )