Importing the Libraries
# Attach reticulate, the package providing use_python() / use_virtualenv().
library(reticulate)
# Request the Anaconda interpreter.
# NOTE: the warning printed below reports that this request is ignored because
# the use_virtualenv() call that follows supersedes it.
use_python("C:/Users/AKM098/AppData/Local/anaconda3/python.exe")
# Request the "r-reticulate" virtualenv; per the warning below, this is the
# request that takes effect.
use_virtualenv("r-reticulate")
## Warning: Previous request to
## `use_python("C:/Users/AKM098/AppData/Local/anaconda3/python.exe", required =
## TRUE)` will be ignored. It is superseded by request to
## `use_python("C:/Users/AKM098/OneDrive - Maersk
## Group/Documents/.virtualenvs/r-reticulate/Scripts/python.exe")
# Show the working directory the R session starts in.
getwd()
## [1] "C:/Users/AKM098/OneDrive - Maersk Group/Data_Science/End-to-End-implementation"
# Switch the working directory to the language_detector data folder.
# NOTE(review): absolute, machine-specific path; it will not exist on another machine.
setwd("C:\\Users\\AKM098\\OneDrive - Maersk Group\\documents_old\\LLM\\language_detector\\data")
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.simplefilter("ignore")
# Read the language-detection corpus from its CSV file.
# NOTE(review): absolute, machine-specific location.
csv_path = "C:\\Users\\AKM098\\OneDrive - Maersk Group\\documents_old\\LLM\\language_detector\\data\\Language Detection.csv"
data = pd.read_csv(csv_path)
# Peek at the last ten rows
data.tail(n=10)
## Text Language
## 10327 ಗಾಡಿಯಲ್ಲಿ ಮನೆಯಲ್ಲಿ ನಾರ್ಸಿಸ್ ಅವಳು ಮನೆಗೆ ತಲುಪಿದಾ... Kannada
## 10328 ಅವಳು ಮನೆಯಲ್ಲಿ ಕುಳಿತಿದ್ದ ತನ್ನ ತಾಯಿಯನ್ನು ತಬ್ಬಿಕೊ... Kannada
## 10329 ಓಹ್ ತಾಯಿ ನಾನು ನಿನ್ನನ್ನು ತುಂಬಾ ಪ್ರೀತಿಸುತ್ತೇನೆ ಮ... Kannada
## 10330 ಅವನು ಸುಂದರವಾಗಿದ್ದನು, ಅವನು ನನ್ನ ಬಗ್ಗೆ ಅಷ್ಟಾಗಿ ಕ... Kannada
## 10331 ಓಹ್ ಇಲ್ಲ ನೀವು ನೋಡಿದಾಗಿನಿಂದ ಇದು ಸರಿಯಾಗಿದೆ. Kannada
## 10332 ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ... Kannada
## 10333 ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್... Kannada
## 10334 ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ... Kannada
## 10335 ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ... Kannada
## 10336 ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು... Kannada
# Number of samples available for each language, most frequent first
language_counts = data["Language"].value_counts()
language_counts
## Language
## English 1385
## French 1014
## Spanish 819
## Portugeese 739
## Italian 698
## Russian 692
## Sweedish 676
## Malayalam 594
## Dutch 546
## Arabic 536
## Turkish 474
## German 470
## Tamil 469
## Danish 428
## Kannada 369
## Greek 365
## Hindi 63
## Name: count, dtype: int64
# Separate the independent feature (text) from the dependent target (language)
X, y = data["Text"], data["Language"]
# First three labels
y.head(3)
## 0 English
## 1 English
## 2 English
## Name: Language, dtype: object
from sklearn.model_selection import train_test_split
# NOTE: the raw text is deliberately not split at this point. The partition the
# model is trained on is drawn further down, after label encoding and
# vectorisation. A split made here was overwritten before any of its four
# results were used, so it only drew an extra, unrelated random partition.
# converting categorical variables to numerical
from sklearn.preprocessing import LabelEncoder
# `le` stays in scope after fitting, so the learned label set remains available.
le = LabelEncoder()
# NOTE: this rebinds `y` from the language-name Series to an array of integer
# codes; the original string labels remain available as data["Language"].
y = le.fit_transform(y)
# Preview the first ten encoded labels.
y[:10]
## array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
# creating bag of words using countvectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Keep the document-term matrix sparse. A dense copy of this 10337 x 39928
# int64 count matrix needs several gigabytes of RAM, and both train_test_split
# and MultinomialNB work directly on scipy sparse input.
# Fitting from data["Text"] (the same column X was bound to) instead of the
# rebound name X keeps this cell safe to re-run.
X = cv.fit_transform(data["Text"])
X.shape
## (10337, 39928)
# Densify only the five previewed rows for display.
X[:5].toarray()
## array([[0, 0, 0, ..., 0, 0, 0],
## [0, 0, 0, ..., 0, 0, 0],
## [0, 0, 0, ..., 0, 0, 0],
## [0, 0, 0, ..., 0, 0, 0],
## [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
Train/test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation.
# random_state pins the partition so every run reports the same metrics;
# stratify=y keeps each language's share equal in both parts, which matters
# because the rarest class has only 63 rows in total.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
x_train.shape
## (8269, 39928)
x_test.shape
## (2068, 39928)
x_test.shape
## (2068, 39928)
Model creation and Prediction
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier trained on the token-count features
model = MultinomialNB()
model.fit(X=x_train, y=y_train)
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Predict the encoded language of every held-out row
y_pred = model.predict(X=x_test)
Evaluating the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Score the held-out predictions: confusion matrix, per-class report, accuracy
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print("Accuracy is :", ac)
## Accuracy is : 0.9787234042553191
# Per-class precision / recall / F1
print(cr)
## precision recall f1-score support
##
## 0 1.00 0.96 0.98 117
## 1 1.00 0.96 0.98 81
## 2 1.00 1.00 1.00 105
## 3 0.88 1.00 0.93 271
## 4 1.00 0.99 0.99 216
## 5 1.00 0.96 0.98 92
## 6 1.00 0.99 0.99 71
## 7 1.00 1.00 1.00 13
## 8 0.99 0.99 0.99 143
## 9 1.00 0.99 0.99 71
## 10 0.98 0.98 0.98 116
## 11 1.00 0.97 0.99 146
## 12 1.00 0.96 0.98 142
## 13 0.99 0.98 0.99 168
## 14 0.99 0.98 0.99 126
## 15 1.00 0.96 0.98 95
## 16 1.00 0.95 0.97 95
##
## accuracy 0.98 2068
## macro avg 0.99 0.98 0.98 2068
## weighted avg 0.98 0.98 0.98 2068
Model Saving
# saving both cv and model
#pickle.dump(cv, open("transform.pkl", "wb"))
#pickle.dump(model, open("model.pkl", "wb"))
Create a pipeline
# Loading the dataset
#data = pd.read_csv("Language Detection.csv")
# Separate the independent (text) and dependent (language) features.
# NOTE: this rebinds X and y to the raw DataFrame columns, replacing the
# vectorised matrix and the integer-encoded labels used by the first model.
X = data["Text"]
y = data["Language"]
from sklearn.model_selection import train_test_split
# random_state pins the partition so the reported metrics are reproducible;
# stratify=y keeps every language (the rarest has only 63 rows) represented in
# the same proportion in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Custom function to convert sparse matrix to dense array
def to_array(X):
    """Return ``X`` as a dense array.

    Parameters
    ----------
    X : object
        Typically a sparse matrix exposing ``toarray()``. An object without
        that method (for example an array that is already dense) is passed
        through unchanged instead of raising ``AttributeError``.

    Returns
    -------
    object
        ``X.toarray()`` when the method exists, otherwise ``X`` itself.

    Notes
    -----
    Keep this a module-level function: pickle stores functions by reference
    (module and name), so a pickled object that wraps it needs the same
    definition importable when it is loaded.
    """
    densify = getattr(X, "toarray", None)
    if densify is None:
        # Nothing to convert: hand the input back untouched.
        return X
    return densify()
# Assemble the three pipeline stages one by one
vectorizer_step = ('vectorizer', CountVectorizer())
# Convert the vectorizer output to a dense array before classification
to_array_step = ('to_array', FunctionTransformer(to_array, accept_sparse=True))
classifier_step = ('classifier', MultinomialNB())
steps = [vectorizer_step, to_array_step, classifier_step]
# Build the pipeline and fit it on the raw training text
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)
Pipeline(steps=[('vectorizer', CountVectorizer()),
('to_array',
FunctionTransformer(accept_sparse=True,
func=<function to_array at 0x000001DF75652A20>)),
('classifier', MultinomialNB())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Predict the language of every held-out text with the fitted pipeline
y_pred = pipe.predict(X_test)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Confusion matrix, per-class report and overall accuracy for those predictions
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print("Accuracy is :", ac)
## Accuracy is : 0.9840425531914894
print(cr)
## precision recall f1-score support
##
## Arabic 1.00 0.97 0.98 116
## Danish 1.00 0.93 0.96 87
## Dutch 1.00 0.98 0.99 97
## English 0.91 1.00 0.95 279
## French 0.99 1.00 0.99 221
## German 1.00 0.99 0.99 75
## Greek 1.00 0.99 0.99 84
## Hindi 1.00 0.90 0.95 10
## Italian 1.00 0.99 1.00 136
## Kannada 1.00 0.95 0.97 77
## Malayalam 1.00 0.99 1.00 114
## Portugeese 0.99 1.00 1.00 136
## Russian 1.00 0.97 0.98 158
## Spanish 1.00 0.99 0.99 151
## Sweedish 0.99 1.00 1.00 135
## Tamil 1.00 1.00 1.00 100
## Turkish 1.00 0.96 0.98 92
##
## accuracy 0.98 2068
## macro avg 0.99 0.98 0.98 2068
## weighted avg 0.99 0.98 0.98 2068
#import pickle
#create pickle file
#pickle.dump(pipe, open('tr_pipe_lng_model.0.1.pkl', 'wb'))
#pkl_model = pickle.load(open('tr_pipe_lng_model.0.1.pkl', 'rb'))
# Smoke-test the fitted pipeline on a single sentence
val = ['¿Tienes sentido común?']
# Alternative sample to try: spiste morgenmad
pipe.predict(X=val)
## array(['Spanish'], dtype='<U10')