DEV Community

SavvyShivam
SavvyShivam

Posted on

sagemaker pie

Import Required Libraries

import os
import tempfile

import boto3
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
Enter fullscreen mode Exit fullscreen mode
Define Constants
# S3 bucket used both for downloading the dataset and for storing the model.
BUCKET_NAME = "employee-data"
# Key prefix (folder) under which the input CSV is looked up.
S3_INPUT_FOLDER = "inputfiles"
# Key prefix (folder) under which the trained model is uploaded.
S3_OUTPUT_FOLDER = "ml-output"
# File name of the dataset inside S3_INPUT_FOLDER.
FILE_NAME = "employee_cleaned_data.csv"

AWS S3 Initialization
# boto3 S3 client reused by every download/upload step of the script.
s3_client = boto3.client('s3')
Enter fullscreen mode Exit fullscreen mode

Task 1: Load Data from S3

# Download the dataset from S3 into a throw-away directory and load it into
# the module-level DataFrame `df` that every later step works on.
s3_file_key = f"{S3_INPUT_FOLDER}/{FILE_NAME}"
try:
    # The temporary directory (and the downloaded copy inside it) is removed
    # automatically on exit, so nothing is left behind on disk. Downloading
    # to a plain path also avoids writing to a file that is still held open.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, FILE_NAME)
        s3_client.download_file(BUCKET_NAME, s3_file_key, temp_file_path)
        print(f"File downloaded successfully from s3://{BUCKET_NAME}/{s3_file_key}")

        # Must be read while the directory still exists.
        df = pd.read_csv(temp_file_path)
    print("Data loaded successfully!")
except Exception as e:
    print("Error loading data from S3:", e)
    # Every later step needs `df`; stop here instead of failing further down
    # with an unrelated NameError.
    raise
Enter fullscreen mode Exit fullscreen mode

Task 2: Preprocess Data
Remove unique identifier column

# Drop the per-row identifier column when present.
if "employee_operations_id" in df.columns:
    df = df.drop(columns=["employee_operations_id"])

# Keep only the first run of digits found in each 'region' value and store it
# as a float; values without any digit become NaN.
if "region" in df.columns:
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence. expand=False returns a Series rather than a one-column frame.
    df['region'] = df['region'].str.extract(r'(\d+)', expand=False).astype(float)
Enter fullscreen mode Exit fullscreen mode

Task 3: Analyze and Visualize Data
Remove duplicates

# Flag every row that repeats an earlier one, report how many there are,
# then keep only the rows that were not flagged.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate records: {duplicate_count}")
df = df[~duplicate_mask]

Pie chart for Gender Distribution
# Pie chart showing the share of rows for each value of 'gender'.
if 'gender' in df.columns:
    gender_counts = df['gender'].value_counts()
    slice_colors = ['#ff9999', '#66b3ff', '#99ff99']

    plt.figure(figsize=(8, 6))
    plt.pie(
        gender_counts,
        labels=gender_counts.index,
        colors=slice_colors,
        autopct='%1.1f%%',
        startangle=140,
    )
    plt.title('Gender Distribution')
    plt.show()
Enter fullscreen mode Exit fullscreen mode

Count plot for Education by Gender

# Grouped bar chart: number of rows per education level, split by gender.
# Drawn only when both columns exist in the dataset.
if all(column in df.columns for column in ('education', 'gender')):
    plt.figure(figsize=(10, 6))
    sns.countplot(x='education', hue='gender', data=df, palette='Set2')
    plt.xticks(rotation=45)
    plt.xlabel('Education Level')
    plt.ylabel('Count')
    plt.title('Education Level Distribution by Gender')
    plt.show()
Enter fullscreen mode Exit fullscreen mode

Task 4: Feature Engineering
Define dependent and independent variables

# Split the dataset into features (X) and target (Y), then one-hot encode the
# categorical features while passing the numeric ones through unchanged.
dependent_variable = "turnover"  # Replace with actual column name
if dependent_variable not in df.columns:
    # Without the target column neither X nor Y can be built; fail here with
    # a clear message instead of a NameError on the first use of X below.
    raise KeyError(
        f"Target column '{dependent_variable}' not found in the dataset; "
        f"available columns: {list(df.columns)}"
    )
X = df.drop(columns=[dependent_variable])
Y = df[dependent_variable]

# Preprocess with ColumnTransformer
# Columns whose dtype is neither object nor int64/float64 appear in neither
# list and are therefore dropped (ColumnTransformer's default remainder).
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', 'passthrough', numerical_columns),
    ],
    # Always return a dense array: the StandardScaler applied later centers
    # the data, which is not possible on a sparse matrix.
    sparse_threshold=0,
)

X_transformed = column_transformer.fit_transform(X)

Feature selection
# Keep the (at most) five features with the highest univariate score.
# f_classif is the scorer meant for a categorical target; f_regression
# assumes a continuous one. k is capped so that a dataset with fewer than
# five transformed features does not make SelectKBest raise.
# NOTE(review): the selector is fitted on the full dataset before the
# train/test split, so test rows influence which features are kept; consider
# fitting it on the training split only.
selector = SelectKBest(score_func=f_classif, k=min(5, X_transformed.shape[1]))
X_selected = selector.fit_transform(X_transformed, Y)
Enter fullscreen mode Exit fullscreen mode

Task 5: Model Training and Evaluation

Split data into training and testing sets
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    Y,
    test_size=0.2,
    random_state=0,
)
Enter fullscreen mode Exit fullscreen mode

Feature scaling

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Enter fullscreen mode Exit fullscreen mode

Train Logistic Regression model

# Fit a logistic-regression classifier on the scaled training data
# (fit() returns the estimator itself, so `model` is the fitted classifier).
model = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
Enter fullscreen mode Exit fullscreen mode

Predictions and evaluation

# Score the held-out split and report the four headline metrics.
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

metric_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
)
for metric_name, metric_value in metric_report:
    print(f"{metric_name}: {metric_value:.2f}")

Confusion matrix heatmap
# Heatmap of the confusion matrix for the held-out split.
# NOTE(review): the two tick labels presumably assume a binary target whose
# first class is the negative one - confirm against the data.
cm = confusion_matrix(y_test, y_pred)
class_labels = ['Negative', 'Positive']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()
Enter fullscreen mode Exit fullscreen mode

Task 6: Deploy Model

# Serialize the trained model and upload it to the output folder in S3.
# `s3_file_key` stays defined afterwards because the prediction step reads
# the model back from the same key.
s3_file_key = f"{S3_OUTPUT_FOLDER}/logistic_regression_model.pkl"

# The model is written inside a temporary directory that is removed on exit,
# so no .pkl file is left behind; writing to a plain path also avoids dumping
# into a file that is still held open by another handle.
with tempfile.TemporaryDirectory() as temp_dir:
    model_file_path = os.path.join(temp_dir, "logistic_regression_model.pkl")
    joblib.dump(model, model_file_path)

    # The upload is best-effort: a failure is reported but does not abort
    # the script.
    try:
        s3_client.upload_file(model_file_path, BUCKET_NAME, s3_file_key)
        print(f"Model uploaded successfully to s3://{BUCKET_NAME}/{s3_file_key}")
    except Exception as e:
        print("Error uploading the model:", e)
Enter fullscreen mode Exit fullscreen mode

Task 7: Prediction Using Deployed Model

# Download the stored model from S3, reload it and re-score the test split.
# Any failure is reported and the script carries on (best-effort step).
try:
    # The downloaded copy lives in a temporary directory that is removed on
    # exit, so it has to be loaded before the `with` block ends.
    with tempfile.TemporaryDirectory() as temp_dir:
        model_file_path = os.path.join(temp_dir, "logistic_regression_model.pkl")
        s3_client.download_file(BUCKET_NAME, s3_file_key, model_file_path)
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load objects from a bucket whose contents are trusted.
        model = joblib.load(model_file_path)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Prediction Accuracy: {accuracy:.2f}")
except Exception as e:
    print("Error during prediction:", e)
Enter fullscreen mode Exit fullscreen mode

Top comments (0)