Krzysztof Bruszewski

TensorFlow misclassification problem

Hi folks.

I would like to build a TensorFlow model to categorize loans by credit risk.
Unfortunately, I have a problem: my model puts all cases into one class.
I don't know whether it comes from the way the metadata is defined in the specify_feature_usages function.
Maybe that part should be done another way? If so, how?
Or does it come from the imbalanced training data (1,113 risk cases versus 2,220 others)?

I would appreciate any advice.

The data source used in the project is credit_data.h5.


import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TensorFlow info/warning logs

import pprint as pp
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import DenseFeatures, Dense, Activation
import tensorflow as tf
from tensorflow import keras
from IPython.display import display
sns.set(style='darkgrid')
print('TensorFlow version: ' + tf.__version__)

# credit_data = pd.read_csv('german_credit_data.csv')
# credit_data.to_hdf('credit_data.h5', key='df', mode='w')
credit_data = pd.read_hdf('credit_data.h5', 'df')  
display(credit_data)

data = credit_data.copy()
# Name of the label column.
label = 'Risk'
id_column = 'CustomerID'
data[label].value_counts().to_frame().T

data.info()

train = 2/3
test = 1 - train
label_lst = [label]
train_data, test_data = train_test_split(data, test_size=test, random_state=0, stratify=data[label_lst])
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
train_data[label].value_counts().to_frame().T

label_new = label + '_target'
train_data[label_new] = train_data[label].apply(lambda x: 1 if x == 'Risk' else 0)
train_data.drop(label, axis=1, inplace=True)

test_data[label_new] = test_data[label].apply(lambda x: 1 if x == 'Risk' else 0)
test_data.drop(label, axis=1, inplace=True)

print("Training dataset")
display(train_data.agg({label_new : ['sum', 'count']}))
print("Test dataset")
display(test_data.agg({label_new : ['sum', 'count']}))

label = label_new
id_column = 'CustomerID'

### Define the numeric (continuous) feature list
numericFeatures = ['LoanDuration', 'LoanAmount', 'InstallmentPercent', 'CurrentResidenceDuration', 'Age', 'ExistingCreditsCount', 'Dependents']
### Define the categorical feature list
objectFeatures = ['CheckingStatus', 'CreditHistory', 'LoanPurpose', 'ExistingSavings', 'EmploymentDuration', 'Sex',
                  'OthersOnLoan', 'OwnsProperty', 'InstallmentPlans', 'Housing', 'Job', 'Telephone', 'ForeignWorker']

# Target column name.
TARGET_COLUMN_NAME = label
# Numeric feature names.
NUMERIC_FEATURE_NAMES = numericFeatures
# Categorical features and their vocabulary lists.
CATEGORICAL_FEATURE_NAMES = objectFeatures

print(TARGET_COLUMN_NAME)
print(NUMERIC_FEATURE_NAMES)
print(CATEGORICAL_FEATURE_NAMES)

def specify_feature_usages(df, label):
    feature_usages = list()
    feature_names = list()

    for feature_name in NUMERIC_FEATURE_NAMES:

        mean = df[feature_name].mean()
        std = df[feature_name].std()

        # Bind mean/std as default arguments: a plain closure here is
        # late-binding, so every zscore would otherwise use the statistics
        # of the *last* numeric feature once the loop has finished.
        def zscore(x, mean=mean, std=std):
            x = tf.dtypes.cast(x, tf.float32)
            return (x - mean) / std

        feature_usage = tf.feature_column.numeric_column(key=feature_name, normalizer_fn=zscore)
        feature_usages.append(feature_usage)
        feature_names.append(feature_name)

    for feature_name in CATEGORICAL_FEATURE_NAMES:

        aggregate = df.groupby(feature_name)[label].agg(['sum','count'])
        aggregate['share'] = aggregate['sum'] / aggregate['count']
        aggregate.sort_values('share', ascending=False, inplace=True)
        vocabulary = aggregate.index.values.tolist()

        feature_usage = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                key=feature_name, vocabulary_list=vocabulary, default_value=0
            )
        )
        feature_usages.append(feature_usage)
        feature_names.append(feature_name)

    return feature_usages, feature_names
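
# Aside on "maybe this part should be done in another way": tf.feature_column
# is deprecated in recent TensorFlow releases in favour of Keras preprocessing
# layers. A minimal sketch of the numeric side (assumes TF >= 2.6; 'LoanAmount'
# is just one example column):
#
#   norm = tf.keras.layers.Normalization(axis=None)
#   norm.adapt(train_data['LoanAmount'].to_numpy())  # learns mean/variance
#
# tf.keras.layers.StringLookup(output_mode='one_hot') plays the same role as
# the indicator columns for the categorical features.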


feature_columns, feature_names = specify_feature_usages(train_data, label)
l_inputs = len(feature_names)
l_outputs = len([label])
print('inputs={inputs}, output={output}'.format(inputs=l_inputs,output=l_outputs))
pp.pprint(feature_columns)

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, target_cols, shuffle=True):
    dataframe = dataframe.copy()
    total_rows = dataframe.shape[0]
    batch_size = int(total_rows / 10)
    labels = dataframe[target_cols]
    features = dataframe.drop(target_cols, axis=1)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(features))
    # Batch unconditionally; in the original the batching sat inside the
    # `if shuffle:` block, so shuffle=False returned an unbatched dataset.
    ds = ds.batch(batch_size)
    return ds


# Convert the dataframes into TensorFlow datasets. Keep the test set
# unshuffled so predictions stay aligned with test_data's row order later on.
train_dataset = df_to_dataset(train_data, label, True)
test_dataset = df_to_dataset(test_data, label, False)
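
# Optional sanity check (a hedged debugging aid, not part of the original
# pipeline): pull one batch and eyeball the shapes and the label mix to
# confirm the dataset feeds what you expect.
#
#   for features, labels in train_dataset.take(1):
#       print(labels.shape, labels.numpy()[:10])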

model = Sequential()
model.add(DenseFeatures(feature_columns))
model.add(Dense(24, activation=tf.nn.sigmoid))
model.add(Dense(12, activation=tf.nn.sigmoid))
model.add(Dense(6, activation=tf.nn.sigmoid))
model.add(Dense(l_outputs, activation=tf.nn.softmax))
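# A likely culprit (offered as a suggestion, not a certainty): softmax
# normalizes across the output units, so with l_outputs == 1 the layer always
# emits exactly 1.0 and every case lands in the positive class no matter what
# the inputs are. For a single-unit binary classifier the usual pairing is a
# sigmoid output with BinaryCrossentropy, e.g. replacing the last layer with:
#
#   model.add(Dense(l_outputs, activation=tf.nn.sigmoid))
#
# (Softmax only makes sense here with two output units and one-hot labels.)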

METRICS = [
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc')
]

model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=METRICS)



model.fit(train_dataset, verbose=0)
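
# Two further hedged suggestions: fit() defaults to a single epoch, which is
# rarely enough, and the classes are imbalanced (~1:2), which class_weight
# can compensate for. A sketch (the epoch count and weighting scheme are
# illustrative, not tuned):
#
#   n_pos = train_data[label].sum()
#   n_neg = len(train_data) - n_pos
#   model.fit(train_dataset, epochs=50,
#             class_weight={0: 1.0, 1: n_neg / n_pos}, verbose=0)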

# evaluate the model
print("Training dataset")
scores = model.evaluate(train_dataset, verbose=0)
scores_metrics = zip(model.metrics_names, scores)
for m, s in list(scores_metrics):
    print(m, s)

print("Test dataset")
scores = model.evaluate(test_dataset, verbose=0)
scores_metrics = zip(model.metrics_names, scores)
for m, s in list(scores_metrics):
    print(m, s)


y_predicted = model.predict(test_dataset)

df_hist = pd.DataFrame(y_predicted)
df_hist.columns = ["Risk_prediction"]
sns.histplot(data=df_hist, x="Risk_prediction", bins=10)
plt.show()

y_predicted = np.where(y_predicted > 0.5, 1, 0).ravel()  # flatten (n, 1) -> (n,) for sklearn
y_actual = test_data[label]
matrix = confusion_matrix(y_actual, y_predicted, labels=[1,0])
report = classification_report(y_actual, y_predicted)
print(matrix)
print(report)
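
# Threshold-free sanity check (hedged suggestion): ROC AUC on the raw scores
# shows whether the model ranks risk cases above non-risk ones at all.
# Compute it before the 0.5 thresholding overwrites y_predicted:
#
#   scores = model.predict(test_dataset).ravel()
#   print('ROC AUC:', roc_auc_score(y_actual, scores))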
