DEV Community

SavvyShivam
SavvyShivam

Posted on

ML Pie chart

import boto3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import tempfile
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Location of the input CSV in S3 (bucket and object key).
bucket_name, file_key = 'employee-data', 'inputfiles/employee_cleaned_data.csv'

# Fetch the object and parse its streaming body directly into a DataFrame.
# s3_client is reused further down for the model upload and download.
s3_client = boto3.client('s3')
s3_location = {'Bucket': bucket_name, 'Key': file_key}
obj = s3_client.get_object(**s3_location)
csv_stream = obj['Body']
df = pd.read_csv(csv_stream)

# Data preprocessing
# Drop employee_id so it is not handed to the model as a feature.
df = df.drop(columns=['employee_id'])
# Replace each region value with the first run of digits it contains, as int.
# The raw string avoids the invalid "\d" escape-sequence warning, and
# expand=False returns a Series (not a one-column DataFrame) for the single
# capture group.
# NOTE: astype(int) raises if any region value contains no digits (NaN).
df['region'] = df['region'].str.extract(r'(\d+)', expand=False).astype(int)
# Separate the feature columns from the 'turnover' target column.
X = df.drop(columns=['turnover'])
y = df['turnover']

# Train-test split
# Split the raw rows first so that every fitted step below (encoding,
# scaling, feature selection) learns its statistics from the training rows
# only. Fitting them on the full dataset leaks test-set information into
# training and inflates the evaluation metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'salary']),
        # Categories that occur only in the test rows are encoded as all
        # zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['department', 'gender', 'education'])
    ],
    # Columns not listed above are forwarded unchanged.
    remainder='passthrough',
    # Always return a dense array: the StandardScaler further down centers
    # the data, which is not supported for sparse matrices.
    sparse_threshold=0
)
X_train_transformed = preprocessor.fit_transform(X_train_raw)
X_test_transformed = preprocessor.transform(X_test_raw)

# Feature selection
# Keep the 5 features with the highest ANOVA F-score, chosen on training data.
selector = SelectKBest(score_func=f_classif, k=5)
X_train = selector.fit_transform(X_train_transformed, y_train)
X_test = selector.transform(X_test_transformed)
selected_features = selector.get_support(indices=True)
feature_names = preprocessor.get_feature_names_out()
selected_feature_names = feature_names[selected_features]
print(f'Selected features: {list(selected_feature_names)}')

# Feature scaling
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model training
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)

# S3 key under which the trained model is stored; shared by the upload and
# the download so the two cannot drift apart.
model_key = 'ml-output/model.pkl'

# Serialize the model and upload to S3
# NOTE: only the classifier is persisted here; the fitted preprocessor,
# selector and scaler are not saved alongside it.
with tempfile.TemporaryFile() as temp_model_file:
    joblib.dump(model, temp_model_file)
    # Rewind so the upload reads from the start of the serialized bytes.
    temp_model_file.seek(0)
    s3_client.upload_fileobj(temp_model_file, bucket_name, model_key)
print('Successfully pushed data to S3: model.pkl')

# Download the model from S3
with tempfile.TemporaryFile() as temp_model_file:
    s3_client.download_fileobj(bucket_name, model_key, temp_model_file)
    # Rewind so joblib reads the downloaded bytes from the beginning.
    temp_model_file.seek(0)
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code.
    # Only load objects from a bucket/key whose writers are trusted.
    loaded_model = joblib.load(temp_model_file)
print('Successfully loaded model from S3')

# Predictions and evaluation
# Score the model that was loaded back from S3 on the held-out test split.
y_pred_loaded = loaded_model.predict(X_test_scaled)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred_loaded)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')

# Plot confusion matrix
# The same class labels are used on both axes of the heatmap.
class_labels = ['No Turnover', 'Turnover']
cm = confusion_matrix(y_test, y_pred_loaded)
plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

Enter fullscreen mode Exit fullscreen mode

AWS GenAI LIVE image

Real challenges. Real solutions. Real talk.

From technical discussions to philosophical debates, AWS and AWS Partners examine the impact and evolution of gen AI.

Learn more

Top comments (0)

Billboard image

Create up to 10 Postgres Databases on Neon's free plan.

If you're starting a new project, Neon has got your databases covered. No credit cards. No trials. No getting in your way.

Try Neon for Free →

👋 Kindness is contagious

Please leave a ❤️ or a friendly comment on this post if you found it helpful!

Okay