import os
import re
import string
import random
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import numpy as np



# Set the path to the folder
folder_path = "C:/Users/Rodrigo/Desktop/txt_sentoken"

# Load the data
reviews = []

for folder_name in os.listdir(folder_path):
    folder = os.path.join(folder_path, folder_name)
    if not os.path.isdir(folder):
        continue
    file_names = [f for f in os.listdir(folder) if f.endswith(".txt")]
    file_names = random.sample(file_names, min(500, len(file_names)))  # randomly select up to 500 files per class
    for file_name in file_names:
        file_path = os.path.join(folder, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
            if folder_name == "pos":
                reviews.append((text, 1))
            else:
                reviews.append((text, 0))
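
# Quick sanity check (added sketch): confirm how many reviews were loaded per class.
# With up to 500 files sampled per folder, this should print roughly balanced counts.
print(Counter(label for _, label in reviews))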

# Text preprocessing
stop_words = set(stopwords.words("english"))
stop_words.update(["movie", "film", "get", "also", "one", "like", "would", "make", "much", "even", "seem"])
ps = PorterStemmer()

def preprocess_text(text):
    text = text.strip()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token not in stop_words]  
    tokens = [ps.stem(token) for token in tokens]
    return " ".join(tokens)

# Apply preprocessing to the reviews
preprocessed_reviews = [preprocess_text(review[0]) for review in reviews]

# Create a Pandas DataFrame with the preprocessed reviews
df = pd.DataFrame(preprocessed_reviews, columns=["Review"])

# Add a column indicating the sentiment (positive or negative)
df["Sentiment"] = [review[1] for review in reviews]

# Calculate the length of each review
df["Length"] = df["Review"].apply(lambda x: len(x.split()))

# Create a bar chart of the top 15 words by frequency
words = " ".join(df["Review"]).split()
word_counts = Counter(words)
top_words = word_counts.most_common(15)
colors = plt.cm.Pastel1(np.arange(len(top_words)))
fig, ax = plt.subplots()
ax.bar([word for word, count in top_words], [count for word, count in top_words], color=colors)
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.set_title('Top 15 Words by Frequency')
plt.xticks(rotation=45, ha='right')
plt.show()

# Create a histogram of word frequencies

freq_counts = Counter(word_counts.values())
fig, ax = plt.subplots()
ax.bar(freq_counts.keys(), freq_counts.values(), width=1)
ax.set_xlabel('Word Frequency')
ax.set_ylabel('Number of Words')
ax.set_title('Distribution of Word Frequencies')
ax.set_xlim([0, 50]) # Set x-axis limit
plt.show()


# Get the words that appear at most 20 times

low_freq_words = set(word for word, count in word_counts.most_common() if count <= 20)

# Count how many of those words occur at each frequency
common_counts = {}
for word, count in word_counts.items():
    if word in low_freq_words:
        if count in common_counts:
            common_counts[count] += 1
        else:
            common_counts[count] = 1

# Create a bar chart of the frequency distribution for those words
fig, ax = plt.subplots()
ax.bar(common_counts.keys(), common_counts.values())
ax.set_xlabel('Word Frequency')
ax.set_ylabel('Number of Words')
ax.set_title('Word Frequency Distribution (Frequency <= 20)')
plt.show()  # This chart gives a clearer view of the distribution


# List of special characters

special_chars = ["!", "#", "$", "%", "&", "*", "+", "-", "/", ":", ";", "<", "=", ">", "?", "@", "^", "_", "`", "{", "|", "}", "~"]

# Function to check whether a review contains special characters
def contains_special_chars(text):
    for char in special_chars:
        if char in text:
            return True
    return False

# Count the number of reviews that contain special characters
num_reviews_with_special_chars = 0
for review, sentiment in reviews:
    if contains_special_chars(review):
        num_reviews_with_special_chars += 1

# Create a bar chart showing the number of reviews with and without special characters
labels = ["Contains special characters", "No special characters"]
values = [num_reviews_with_special_chars, len(reviews) - num_reviews_with_special_chars]
colors = ["red", "green"]
fig, ax = plt.subplots()
ax.bar(labels, values, color=colors)
ax.set_ylabel("Number of reviews")
ax.set_title("Reviews with Special Characters")
plt.show()


# Build a dictionary with the frequency of each special character across the reviews
# (special_chars was defined above)
char_counts = {char: 0 for char in special_chars}
for review, sentiment in reviews:
    for char in special_chars:
        char_counts[char] += review.count(char)

# Sort the dictionary by frequency, descending
char_counts = dict(sorted(char_counts.items(), key=lambda item: item[1], reverse=True))

# Build the bar labels and heights
labels = list(char_counts.keys())
heights = list(char_counts.values())

# Create a bar chart showing the frequency of the special characters
fig, ax = plt.subplots()
ax.bar(labels, heights)
ax.set_xlabel("Special Character")
ax.set_ylabel("Frequency")
ax.set_title("Frequency of Special Characters in the Reviews")
plt.show()
## Note: drawing this figure intermittently raised "IndexError: list index out of range"
## inside the matplotlib Qt backend's idle redraw; the character counts computed above are unaffected.

# Vectorize the data using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_reviews).toarray()
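
# Quick check (added): dimensions of the TF-IDF matrix. Note that .toarray() above
# densifies the sparse matrix, which is fine at this data size but would not scale
# to much larger corpora.
print(f"TF-IDF matrix shape: {X.shape}")  # rows = reviews, columns = vocabulary terms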

# Get the labels
y = [review[1] for review in reviews]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and evaluate a random forest classifier
rf = RandomForestClassifier()
param_grid = {
    "n_estimators": [10, 30, 50],
    "max_depth": [1, 3, 5, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
## GridSearchCV(cv=5, estimator=RandomForestClassifier(),
##              param_grid={'max_depth': [1, 3, 5, None],
##                          'min_samples_leaf': [1, 2, 4],
##                          'min_samples_split': [2, 5, 10],
##                          'n_estimators': [10, 30, 50]})
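
# Added sketch: inspect what the grid search selected (best_params_ and best_score_
# are standard GridSearchCV attributes); values will vary with the random sample above.
print("Best RF params:", grid_search.best_params_)
print("Best RF CV accuracy:", round(grid_search.best_score_, 3))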
rf_best = grid_search.best_estimator_
rf_y_pred = rf_best.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")

#Fit and evaluate an SVM classifier
## Random Forest Accuracy: 0.80
svc = SVC()
param_grid = {
    "C": [0.1, 1, 3],
    "kernel": ["linear", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"]
}
grid_search = GridSearchCV(svc, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
## GridSearchCV(cv=5, estimator=SVC(),
##              param_grid={'C': [0.1, 1, 3], 'gamma': ['scale', 'auto'],
##                          'kernel': ['linear', 'rbf', 'sigmoid']})
svc_best = grid_search.best_estimator_
svc_y_pred = svc_best.predict(X_test)
svc_accuracy = accuracy_score(y_test, svc_y_pred)
print(f"SVM Accuracy: {svc_accuracy:.2f}")

#Fit and evaluate a k-NN classifier
## SVM Accuracy: 0.82
knn = KNeighborsClassifier()
param_grid = {
    "n_neighbors": [3, 5, 7, 9],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 20, 30, 40, 50]
}
grid_search = GridSearchCV(knn, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
## GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
##              param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
##                          'leaf_size': [10, 20, 30, 40, 50],
##                          'n_neighbors': [3, 5, 7, 9],
##                          'weights': ['uniform', 'distance']})
knn_best = grid_search.best_estimator_
knn_y_pred = knn_best.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_y_pred)
print(f"k-NN Accuracy: {knn_accuracy:.2f}")

#Compare the performance of the classifiers
## k-NN Accuracy: 0.65
models = ["Random Forest", "SVM", "k-NN"]
accuracies = [rf_accuracy, svc_accuracy, knn_accuracy]
plt.figure(figsize=(8, 6))
sns.barplot(x=models, y=accuracies)
plt.ylim([0, 1])
plt.title("Classifier Accuracy Comparison")
plt.show()
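
# Added summary (sketch): the same comparison as a sorted table, using the pandas
# import already present; the accuracies come from a single 80/20 split, so they
# will shift with the random file sample and seed.
results = pd.DataFrame({"Model": models, "Accuracy": accuracies})
print(results.sort_values("Accuracy", ascending=False))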