import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sb
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


# Load the training set from the local CSV export and preview its first rows.
DATA_PATH = 'C:/Users/gorgu/Downloads/donnees mobile/train.csv'
df = pd.read_csv(DATA_PATH)
df.head(n=10)


# Number of rows (observations) in the dataset.
df.shape[0]

2000


# Print the column names, non-null counts and dtypes of the dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_screen   2000 non-null   int64  
 19  wifi           2000 non-null   int64  
 20  price_range    2000 non-null   int64  
dtypes: float64(2), int64(19)
memory usage: 328.2 KB


# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()


# Count the missing values in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64


# Pairwise correlations between all columns, drawn as an unannotated heatmap.
correlation_matrix = df.corr()
sb.heatmap(
    data=correlation_matrix,
    cmap='coolwarm',
    annot=False,
)
plt.show()


# Correlation of every column with price_range, sorted from highest to lowest.
price_corr = correlation_matrix[['price_range']].sort_values(
    by='price_range',
    ascending=False,
)

plt.figure(figsize=(6, 6))
heatmap = sb.heatmap(
    price_corr,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='RdYlGn',
    annot_kws={"fontsize": 12},
)
# Trailing semicolon kept from the original: in a notebook it suppresses the
# echo of the Text object returned by set_title.
heatmap.set_title('correlation des variables par rapport le prix ', fontdict={'fontsize': 18}, pad=16);


# Pie chart of how many rows fall into each price class, labelled in percent.
price_counts = df['price_range'].value_counts()
price_counts.plot(
    kind='pie',
    autopct='%.2f%%',
    title='Diagramme circulaire representant % des prix',
    ylabel='prix',
)

<AxesSubplot:title={'center':'Diagramme circulaire representant % des prix'}, ylabel='prix'>


# Absolute number of rows in each price class.
df["price_range"].value_counts()

1    500
2    500
3    500
0    500
Name: price_range, dtype: int64


# Relative frequency of each price class (fractions that sum to 1).
df["price_range"].value_counts(normalize=True)

1    0.25
2    0.25
3    0.25
0    0.25
Name: price_range, dtype: float64


import plotly.express as px


# Box plot of RAM for each price class.
# Pass the whole DataFrame and refer to columns by name, rather than handing
# plotly one Series as the data frame and a second Series as `y`; this keeps
# the axis/hover labels tied to the column names and makes it easy to add
# further columns (e.g. color=...) later.
fig1 = px.box(df, x="price_range", y="ram")
fig1.show()


from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is price_range.
y = df['price_range']
X = df.drop('price_range', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Number of rows in the training feature matrix.
X_train.shape[0]

1600


# Number of training labels (should match the number of training rows).
y_train.shape[0]

1600


from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training targets only, then
# apply that same mapping to both splits.
le = LabelEncoder()
le.fit(y_train)
y_train_df = le.transform(y_train)
y_test_df = le.transform(y_test)


from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# One-hot encode any categorical feature columns.
#
# Encoding the train and test splits in two independent get_dummies calls is
# fragile: a category that appears in only one split produces different
# columns, and with drop_first=True each split may even drop a different
# level. Encoding both splits in a single pass guarantees identical columns
# and an identical reference level, then the splits are separated again via
# the outer "train"/"test" key.
#
# NOTE: every column reported by df.info() is numeric, so with the current
# data get_dummies leaves the features unchanged; the alignment matters as
# soon as an object/category column is introduced.
_combined = pd.concat([X_train, X_test], keys=["train", "test"])
_encoded = pd.get_dummies(_combined, drop_first=True, dtype=int)

X_train_df = _encoded.loc["train"]
X_test_df = _encoded.loc["test"]


from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Logistic regression on standardised features.
# The features live on very different scales (e.g. ram in the thousands vs.
# 0/1 flags), which made the default lbfgs solver stop at its iteration limit
# with a ConvergenceWarning. Standardising inside a pipeline (the scaler is
# fitted on the training split only) and allowing more iterations lets the
# solver converge.
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr.fit(X_train_df, y_train_df)

# Predict the classes on the test split (once) and report the accuracy.
y_test_pred = lr.predict(X_test_df)
print("l'accuracy est :", accuracy_score(y_test_df, y_test_pred))

# Compute the confusion matrix.
conf_matrix = confusion_matrix(y_test_df, y_test_pred)

# Display the confusion matrix with seaborn, labelling both axes with the
# classes known to the label encoder.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Prédiction')
plt.ylabel('Vraie Valeur')
plt.title('Matrice de Confusion')
plt.show()

l'accuracy est : 0.6325

C:\Users\gorgu\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, fitted on the training split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_df, y_train_df)

# Predict the test split and report the accuracy.
y_test_pred_rf = rf_model.predict(X_test_df)
accuracy_rf = accuracy_score(y_true=y_test_df, y_pred=y_test_pred_rf)
print("Précision du modèle Random Forest : ", accuracy_rf)

# Confusion matrix for the random forest, drawn as an annotated heatmap whose
# ticks are the classes known to the label encoder.
conf_matrix_rf = confusion_matrix(y_true=y_test_df, y_pred=y_test_pred_rf)
rf_ax = sns.heatmap(
    conf_matrix_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
rf_ax.set_xlabel('Prédiction')
rf_ax.set_ylabel('Vraie Valeur')
rf_ax.set_title('Matrice de Confusion - Random Forest')
plt.show()

Précision du modèle Random Forest :  0.8925


# MultinomialNB stays imported so any other cell that references it keeps
# working; the model below uses GaussianNB.
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Naive Bayes model.
# MultinomialNB models event counts (e.g. word frequencies). The features here
# are continuous measurements (battery_power, ram, clock_speed, ...), so the
# Gaussian variant, which models each feature as normally distributed within
# a class, is the matching choice.
nb_model = GaussianNB()

# Fit the model on the training split.
nb_model.fit(X_train_df, y_train_df)

# Predict on the test split.
y_test_pred_nb = nb_model.predict(X_test_df)

# Compute and report the accuracy.
accuracy_nb = accuracy_score(y_test_df, y_test_pred_nb)
print("Précision du modèle Naive Bayes : ", accuracy_nb)

# Compute the confusion matrix for Naive Bayes.
conf_matrix_nb = confusion_matrix(y_test_df, y_test_pred_nb)

# Display the confusion matrix with seaborn.
sns.heatmap(conf_matrix_nb, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Prédiction')
plt.ylabel('Vraie Valeur')
plt.title('Matrice de Confusion - Naive Bayes')
plt.show()

Précision du modèle Naive Bayes :  0.5475


from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# k-nearest-neighbours classifier; the neighbour count (3) is a tunable choice.
N_NEIGHBORS = 3
knn_model = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn_model.fit(X_train_df, y_train_df)

# Predict the test split and report the accuracy.
y_test_pred_knn = knn_model.predict(X_test_df)
accuracy_knn = accuracy_score(y_test_df, y_test_pred_knn)
print("Précision du modèle KNN : ", accuracy_knn)

# Confusion matrix for KNN, shown as an annotated heatmap.
conf_matrix_knn = confusion_matrix(y_test_df, y_test_pred_knn)
knn_ax = sns.heatmap(
    conf_matrix_knn,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
knn_ax.set(
    xlabel='Prédiction',
    ylabel='Vraie Valeur',
    title='Matrice de Confusion - KNN',
)
plt.show()

Précision du modèle KNN :  0.9275


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Decision tree with a fixed seed, fitted on the training split.
tree_model = DecisionTreeClassifier(random_state=0)
tree_model.fit(X_train_df, y_train_df)

# Predict the test split and report the accuracy.
y_test_pred_tree = tree_model.predict(X_test_df)
accuracy_tree = accuracy_score(y_true=y_test_df, y_pred=y_test_pred_tree)
print("Précision du modèle d'arbre de décision : ", accuracy_tree)

# Confusion matrix for the decision tree, shown as an annotated heatmap.
conf_matrix_tree = confusion_matrix(y_true=y_test_df, y_pred=y_test_pred_tree)
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
sns.heatmap(conf_matrix_tree, **heatmap_options)
plt.xlabel('Prédiction')
plt.ylabel('Vraie Valeur')
plt.title("Matrice de Confusion - Arbre de Décision")
plt.show()

Précision du modèle d'arbre de décision :  0.8425


from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Support vector classifier with a linear kernel; both the kernel and the
# regularisation strength C are tunable choices.
svm_model = SVC(C=1.0, kernel='linear')
svm_model.fit(X_train_df, y_train_df)

# Predict the test split and report the accuracy.
y_test_pred_svm = svm_model.predict(X_test_df)
accuracy_svm = accuracy_score(y_test_df, y_test_pred_svm)
print("Précision du modèle SVM : ", accuracy_svm)

# Confusion matrix for the SVM, shown as an annotated heatmap.
conf_matrix_svm = confusion_matrix(y_test_df, y_test_pred_svm)
svm_ax = sns.heatmap(
    conf_matrix_svm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
svm_ax.set_xlabel('Prédiction')
svm_ax.set_ylabel('Vraie Valeur')
svm_ax.set_title('Matrice de Confusion - SVM')
plt.show()

Précision du modèle SVM :  0.97

	battery_power	blue	clock_speed	dual_sim	fc	four_g	int_memory	m_dep	mobile_wt	n_cores	...	px_height	px_width	ram	sc_h	sc_w	talk_time	three_g	touch_screen	wifi	price_range
0	842	0	2.2	0	1	0	7	0.6	188	2	...	20	756	2549	9	7	19	0	0	1	1
1	1021	1	0.5	1	0	1	53	0.7	136	3	...	905	1988	2631	17	3	7	1	1	0	2
2	563	1	0.5	1	2	1	41	0.9	145	5	...	1263	1716	2603	11	2	9	1	1	0	2
3	615	1	2.5	0	0	0	10	0.8	131	6	...	1216	1786	2769	16	8	11	1	0	0	2
4	1821	1	1.2	0	13	1	44	0.6	141	2	...	1208	1212	1411	8	2	15	1	1	0	1
5	1859	0	0.5	1	3	0	22	0.7	164	1	...	1004	1654	1067	17	1	10	1	0	0	1
6	1821	0	1.7	0	4	1	10	0.8	139	8	...	381	1018	3220	13	8	18	1	0	1	3
7	1954	0	0.5	1	0	0	24	0.8	187	4	...	512	1149	700	16	3	5	1	1	1	0
8	1445	1	0.5	0	0	0	53	0.7	174	7	...	386	836	1099	17	1	20	1	0	0	0
9	509	1	0.6	1	2	1	9	0.1	93	5	...	1137	1224	513	19	10	12	1	0	0	0

	battery_power	blue	clock_speed	dual_sim	fc	four_g	int_memory	m_dep	mobile_wt	n_cores	...	px_height	px_width	ram	sc_h	sc_w	talk_time	three_g	touch_screen	wifi	price_range
count	2000.000000	2000.0000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	...	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000	2000.000000
mean	1238.518500	0.4950	1.522250	0.509500	4.309500	0.521500	32.046500	0.501750	140.249000	4.520500	...	645.108000	1251.515500	2124.213000	12.306500	5.767000	11.011000	0.761500	0.503000	0.507000	1.500000
std	439.418206	0.5001	0.816004	0.500035	4.341444	0.499662	18.145715	0.288416	35.399655	2.287837	...	443.780811	432.199447	1084.732044	4.213245	4.356398	5.463955	0.426273	0.500116	0.500076	1.118314
min	501.000000	0.0000	0.500000	0.000000	0.000000	0.000000	2.000000	0.100000	80.000000	1.000000	...	0.000000	500.000000	256.000000	5.000000	0.000000	2.000000	0.000000	0.000000	0.000000	0.000000
25%	851.750000	0.0000	0.700000	0.000000	1.000000	0.000000	16.000000	0.200000	109.000000	3.000000	...	282.750000	874.750000	1207.500000	9.000000	2.000000	6.000000	1.000000	0.000000	0.000000	0.750000
50%	1226.000000	0.0000	1.500000	1.000000	3.000000	1.000000	32.000000	0.500000	141.000000	4.000000	...	564.000000	1247.000000	2146.500000	12.000000	5.000000	11.000000	1.000000	1.000000	1.000000	1.500000
75%	1615.250000	1.0000	2.200000	1.000000	7.000000	1.000000	48.000000	0.800000	170.000000	7.000000	...	947.250000	1633.000000	3064.500000	16.000000	9.000000	16.000000	1.000000	1.000000	1.000000	2.250000
max	1998.000000	1.0000	3.000000	1.000000	19.000000	1.000000	64.000000	1.000000	200.000000	8.000000	...	1960.000000	1998.000000	3998.000000	19.000000	18.000000	20.000000	1.000000	1.000000	1.000000	3.000000

INTRODUCTION¶

Visualiser la distribution de la RAM en fonction des différentes catégories de prix.¶

SEPARATION DES DONNEES¶

Encodage de la variable cible avec LabelEncoder :¶

Encodage des variables catégorielles avec pd.get_dummies (encodage one-hot) :¶

MODELE DE REGRESSION LOGISTIQUE¶

Méthode de Random Forest¶

Par la méthode de Naive Bayes¶

PAR LA METHODE DES KNN¶

PAR LA METHODE D'ARBRE DE DECISION¶

PAR LA METHODE SVM¶