Train a logistic regression model on the Titanic data set and save the trained model. Then load the saved model and predict on test data.

This tutorial explains how to create custom blocks that load training data from the project space and preprocess it, and how to train a logistic regression model using the LogisticRegression block, which is one of the system-provided blocks.

Prerequisite

  1. Create a folder named titanic in the project space and upload the Titanic training and test data as train.csv and test.csv.

1. Import required packages from SDK

from razor.marketplace.blocks.rzt.ML_Blocks import LogisticRegression
from razor.api import project_space_path
import razor.flow as rf

2. Define a custom block to read the training data from project space

import pandas as pd
@rf.block
class CsvReader:
    """Stream a CSV file downstream as a series of pandas DataFrames.

    ``filename`` is passed to ``project_space_path`` to obtain the location
    of the file; every chunk that pandas reads from it is put on ``output``.
    """
    filename: str
    output: rf.SeriesOutput[pd.DataFrame]

    def run(self):
        """Read the CSV in 10-row chunks and emit each chunk on ``output``."""
        csv_location = project_space_path(self.filename)
        # chunksize makes read_csv return an iterator of DataFrames instead
        # of loading the whole file at once.
        read_options = {"chunksize": 10, "nrows": None, "delimiter": None}
        reader = pd.read_csv(csv_location, **read_options)
        for chunk in reader:
            self.output.put(chunk)

3. Define a custom block to remove rows with NaN values

@rf.block
class DfFilterNan():
    """Drop rows containing NaN from each incoming chunk and merge the result.

    Consumes the series of DataFrames on ``df_chunks`` and puts a single
    concatenated DataFrame on ``output``. Row indexes of the chunks are kept
    as they arrive (no re-indexing is performed).
    """
    df_chunks: rf.SeriesInput[pd.DataFrame]
    output: rf.Output[pd.DataFrame]

    def run(self):
        """Filter every chunk, then concatenate all of them exactly once."""
        # dropna() without inplace returns a new frame, so the chunks handed
        # over by the upstream block are not mutated.
        cleaned_chunks = [chunk.dropna(axis=0) for chunk in self.df_chunks]
        if cleaned_chunks:
            # A single concat avoids re-copying the accumulated rows on every
            # iteration and avoids concatenating with an empty seed frame.
            merged = pd.concat(cleaned_chunks)
        else:
            # pd.concat raises ValueError on an empty list; an empty series
            # yields an empty DataFrame, matching the previous behaviour.
            merged = pd.DataFrame()
        self.output.put(merged)

4. Define a custom block to convert categorical fields to numeric

@rf.block
class DfCategorical():
    """Replace the listed columns of a DataFrame with integer category codes.

    ``columns`` names the columns to convert, ``df`` is the DataFrame to
    convert (it is modified in place), and the converted frame is put on
    ``output``.
    """
    columns: list
    df: pd.DataFrame
    output: rf.Output[pd.DataFrame]

    def run(self):
        """Encode each requested column and emit the updated DataFrame."""
        frame = self.df
        for name in self.columns:
            # Casting to 'category' and reading .cat.codes maps every
            # distinct value to an integer code.
            frame[name] = frame[name].astype('category').cat.codes
        self.output.put(frame)

5. Define a custom block to generate two NumPy arrays, one for the input and one for the target, based on the column names provided

import numpy as np
@rf.block
class Get_data():
    """Split a DataFrame into feature and target NumPy arrays.

    ``x_columns`` selects the feature columns and ``y_column`` the target
    column(s) of ``df``. The feature array is put on ``out_x``; the target
    array is put on ``out_y`` only when ``y_column`` is a non-empty list.
    """
    x_columns: list
    y_column: list
    df: pd.DataFrame
    out_x: rf.Output[np.ndarray]
    out_y: rf.Output[np.ndarray]

    def run(self):
        """Extract the arrays and emit them (features first, then target)."""
        features = self.df[self.x_columns].values
        has_target = self.y_column is not None and len(self.y_column) > 0
        if has_target:
            # squeeze drops length-1 axes from the selected target values.
            target = np.squeeze(self.df[self.y_column].values)
        # Nothing is emitted until every array has been built.
        self.out_x.put(features)
        if has_target:
            self.out_y.put(target)

6. Build and display the pipeline

# Source block: reads the training CSV in chunks.
atomic_csv_reader = CsvReader(filename="titanic/train.csv")

# Drop rows containing NaN and merge the chunks into one DataFrame.
df_filter = DfFilterNan(df_chunks=atomic_csv_reader.output)

# Convert the listed columns to integer category codes.
df_cat = DfCategorical(
    columns=["Sex", "Cabin", "Embarked"],
    df=df_filter.output,
)

# Split the frame into the feature array (out_x) and target array (out_y).
train_data = Get_data(
    x_columns=['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Cabin', 'Embarked'],
    y_column=['Survived'],
    df=df_cat.output,
)

# System-provided block: fit the model on the arrays above.
# NOTE(review): presumably save=True persists the fitted model to `path` - confirm against the SDK.
lr_model_train = LogisticRegression(
    operation="fit",
    x_data=train_data.out_x,
    y_data=train_data.out_y,
    path="lr_m1.sav",
    save=True,
)

# The pipeline is defined by its target block, then displayed.
pipeline = rf.Pipeline(targets=[lr_model_train])
pipeline.show()

svg

7. Run the pipeline

# Execute the training pipeline (its target is lr_model_train).
pipeline.execute()
<razor_tools.backend.ipython.mime.run_monitor.RunMonitor at 0x7f2976d9bb90>





<Process(Pipeline Manager('Pipeline_17'), stopped)>
import pandas as pd
import numpy as np
@rf.block
class NumpyToCsv():
    """Write a NumPy array to a CSV file under a single 'Predictions' column.

    ``numpy_array`` holds the values to write and ``output_path`` is passed
    to ``project_space_path`` to obtain the destination of the CSV file.
    """
    numpy_array: np.ndarray
    output_path: str

    def run(self):
        """Wrap the array in a DataFrame and save it as CSV."""
        predictions = pd.DataFrame(self.numpy_array, columns=['Predictions'])
        destination = project_space_path(self.output_path)
        predictions.to_csv(destination)
        
# Build a separate preprocessing chain for the test file so that predictions
# are made on the test data rather than on the training features.
test_csv_reader = CsvReader(filename="titanic/test.csv")
test_df_filter = DfFilterNan(df_chunks=test_csv_reader.output)
# NOTE(review): category codes are derived from the test file alone, so they
# may not line up with the codes used during training - confirm or share the
# encoding between both pipelines.
test_df_cat = DfCategorical(
    columns=["Sex", "Cabin", "Embarked"],
    df=test_df_filter.output,
)
# An empty y_column makes Get_data emit only the feature array on out_x.
test_data = Get_data(
    x_columns=['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Cabin', 'Embarked'],
    y_column=[],
    df=test_df_cat.output,
)

# Predict with the model stored at `path` by the training pipeline.
lr_model_predict = LogisticRegression(
    operation="predict",
    x_data=test_data.out_x,
    attribute="classes_",
    path="lr_m1.sav",
    load=True,
)

# Write the predictions to a CSV file in project space.
csv_writer = NumpyToCsv(
    output_path="lr_pred_1.csv",
    numpy_array=lr_model_predict.predictions,
)
predict_pipeline = rf.Pipeline(targets=[csv_writer])
predict_pipeline.show()

svg

# Execute the prediction pipeline (its target is csv_writer).
predict_pipeline.execute()
<razor_tools.backend.ipython.mime.run_monitor.RunMonitor at 0x7f2974167e90>





<Process(Pipeline Manager('Pipeline_24'), stopped)>