Copyright 2012 @ Canopus Business Management Group. All Rights Reserved This case study is created by Canopus Business Management Group for indiviual use and learning purpose. Not be reproduced for any other purpose. Refer to www.collaborat.com for terms of use
Author : Nilakantasrinivasan (Neil) neil@collaborat.com
# Importing basic packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes =True)
%matplotlib inline
# Importing the data file for the Clear Calls Case Study
CS=pd.read_excel("Case Study Data XY for python GB.xls", index_col=0)
# Descriptive Statistics of the data
CS.describe().transpose()
# Checking for null values
CS.isnull().values.any()
# Identifying rows with NAN values
CS[pd.isnull(CS).any(axis=1)]
# Dropping rows with NAN Values
CS=CS.dropna(axis=0)
CS.describe()
# Locating rows which have zero values
CS.loc[(CS==0).any(axis=1)]
# Drop 7th row with zero values
CS.drop(index=7, axis=0)
CS['Type of Technology (Modem, routers, etc)'].unique()
CS['Service Affiliate CODE'].unique()
CS['Service Affiliate TEAMS'].unique()
CS.loc[CS['Type of Technology (Modem, routers, etc)'] == 'd', 'Type of Technology (Modem, routers, etc)'] = 'D'
CS['Type of Technology (Modem, routers, etc)'].unique()
CS.loc[CS['Service Affiliate CODE'] == 'a', 'Service Affiliate CODE'] = 'A'
CS['Service Affiliate CODE'].unique()
CS.loc[CS['Service Affiliate TEAMS'] == 'a2', 'Service Affiliate TEAMS'] = 'A2'
CS['Service Affiliate TEAMS'].unique()
# Group the dataframe into two based on Defects and Non Defects (For defectives here)
CSD=CS[CS['Defects / Non Defects']=='D']
CSD
# Same as above ( for Non Defectives)
CSN=CS[CS['Defects / Non Defects']=='N']
CSN
# Histogram Ploting for Installation data
plt.title("Histogram for Installation TAT in Hours")
sns.distplot(CS['Installation TAT in Hours'], kde=False)
plt.figure(figsize=(20, 6))
plt.subplot(1,3,1)
sns.set_style("dark")
plt.title("Histogram for Commission to SA")
sns.distplot(CS['Commission to SA'],kde=False)
plt.subplot(1,3,2)
sns.distplot(CSN['Commission to SA'],kde=False,color="Blue", label="Commission for Non Defects")
sns.distplot(CSD['Commission to SA'],kde=False,color = "Gold", label = "Commission for Defects")
plt.title("Histograms for Commission to SA")
plt.legend()
plt.subplot(1,3,3)
sns.boxplot(x=CS['Defects / Non Defects'],y=CS['Commission to SA'])
plt.title("Segmented Boxplot for Commission")
plt.figure(figsize=(20, 6))
plt.subplot(1,3,1)
sns.set_style("dark")
plt.title("Histogram for Items in Queue")
sns.distplot(CS['No. of installations requests in queue'],kde=False)
plt.subplot(1,3,2)
sns.distplot(CSN['No. of installations requests in queue'],kde=False,color="Blue", label="Items in Queue for Non Defects")
sns.distplot(CSD['No. of installations requests in queue'],kde=False,color = "Gold", label = "Items in Queue for Defects")
plt.title("Histograms for Items in Queue")
plt.legend()
plt.subplot(1,3,3)
sns.boxplot(x=CS['Defects / Non Defects'],y=CS['No. of installations requests in queue'])
plt.title("Segmented Boxplot for Items in Queue")
plt.figure(figsize=(20, 6))
plt.subplot(1,3,1)
sns.set_style("dark")
plt.title("Histogram for Processing Time@ Back Office")
sns.distplot(CS['Prcessing Time@ Back Office'],kde=False)
plt.subplot(1,3,2)
sns.distplot(CSN['Prcessing Time@ Back Office'],kde=False,color="Blue", label="Prcessing Time@ Back Office for Non Defects")
sns.distplot(CSD['Prcessing Time@ Back Office'],kde=False,color = "Gold", label = "Prcessing Time@ Back Office for Defects")
plt.title("Histograms for Prcessing Time@ Back Office")
plt.legend()
plt.subplot(1,3,3)
sns.boxplot(x=CS['Defects / Non Defects'],y=CS['Prcessing Time@ Back Office'])
plt.title("Segmented Boxplot for Prcessing Time@ Back Office")
plt.figure(figsize=(20, 6))
plt.subplot(1,3,1)
sns.set_style("dark")
plt.title("Histogram for Distance from the SA office")
sns.distplot(CS['Distance from the SA office'],kde=False)
plt.subplot(1,3,2)
sns.distplot(CSN['Distance from the SA office'],kde=False,color="Blue", label="Distance from the SA office for Non Defects")
sns.distplot(CSD['Distance from the SA office'],kde=False,color = "Gold", label = "Distance from the SA office for Defects")
plt.title("Histograms for Distance from the SA office")
plt.legend()
plt.subplot(1,3,3)
sns.boxplot(x=CS['Defects / Non Defects'],y=CS['Distance from the SA office'])
plt.title("Segmented Boxplot for Distance from the SA office")
plt.figure(figsize=(20, 6))
plt.subplot(1,3,1)
sns.set_style("dark")
plt.title("Histogram for Distance travelled")
sns.distplot(CS['Distance travelled'],kde=False)
plt.subplot(1,3,2)
sns.distplot(CSN['Distance travelled'],kde=False,color="Blue", label="Distance travelled for Non Defects")
sns.distplot(CSD['Distance travelled'],kde=False,color = "Gold", label = "Distance travelled for Defects")
plt.title("Histograms for Distance travelled")
plt.legend()
plt.subplot(1,3,3)
sns.boxplot(x=CS['Defects / Non Defects'],y=CS['Distance travelled'])
plt.title("Segmented Boxplot for Distance travelled")
# Ploting multiple boxplots and comparing the factors of two variables
plt.figure(figsize=(20, 6))
plt.subplot(1,2,1)
sns.boxplot(x=CS['Service Affiliate CODE'], y=CS['Installation TAT in Hours'])
plt.subplot(1,2,2)
sns.boxplot(x=CS['Service Affiliate TEAMS'], y=CS['Installation TAT in Hours'])
# Numercial Descripion of Installation TAT
CS['Installation TAT in Hours'].describe()
from scipy import stats
stats.normaltest(CS['Installation TAT in Hours']).pvalue
from statsmodels.graphics.gofplots import qqplot
from matplotlib import pyplot
qqplot(CS['Installation TAT in Hours'], line ='s')
pyplot.show()
import stemgraphic
from stemgraphic import stem_graphic
stemgraphic.stem_graphic(CS['Installation TAT in Hours'], scale = 10)
sns.distplot(CS['Installation TAT in Hours'],hist=False)
plt.axvline(x=30, color='r')
sns.pairplot(CS, vars=['Installation TAT in Hours','Commission to SA'])
sns.pairplot(CS,vars=['Installation TAT in Hours','Commission to SA','No. of installations requests in queue','Prcessing Time@ Back Office','Distance from the SA office','Distance travelled','Installation TAT in Hours'])
plt.title("Pairplot of Variables by Outcome")
cor = CS.corr(method ='pearson')
cor
sns.heatmap(cor)
# Run No.1
inputs=['No. of installations requests in queue','Distance travelled','Distance from the SA office']
X=CS[inputs]
import statsmodels.api as sm
X = sm.add_constant(X)
y= CS['Installation TAT in Hours']
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()
# Run No.2
inputs1=['No. of installations requests in queue','Distance travelled']
X=CS[inputs1]
X = sm.add_constant(X)
y= CS['Installation TAT in Hours']
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()
# Run No.3 (Without Y Intercept - 'C' in Y=mx+C will be not included in the model now)
inputs1=['No. of installations requests in queue','Distance travelled']
X=CS[inputs1]
y= CS['Installation TAT in Hours']
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()
Each variable (discrete) will be checked whether it influences Installation TAT or not in the following tests
Data has to be stacked
from statsmodels.formula.api import ols # For n-way ANOVA
from statsmodels.stats.anova import anova_lm # For n-way ANOVA
Ho : Installation TAT for different technologies is same Ha : Installation TAT for different technologies is different - Type of Modem
plt.figure(figsize=(10, 6))
sns.boxplot(x=CS['Type of Technology (Modem, routers, etc)'], y=CS['Installation TAT in Hours'])
y= CS['Installation TAT in Hours']
X=CS['Type of Technology (Modem, routers, etc)']
formula = 'y ~ X'
model = ols(formula, CS).fit()
aov_table = anova_lm(model)
print(aov_table)
Ho : Installation TAT for different Service Affiliates is same Ha : Installation TAT for different Service Affiliates is different
plt.figure(figsize=(10, 6))
sns.boxplot(x=CS['Service Affiliate CODE'], y=CS['Installation TAT in Hours'])
SA_CODE=CS['Service Affiliate CODE']
formula = 'y ~ SA_CODE'
model = ols(formula, CS).fit()
aov_table = anova_lm(model)
print(aov_table)
plt.figure(figsize=(10, 6))
sns.boxplot(x=CS['Service Affiliate TEAMS'], y=CS['Installation TAT in Hours'])
y= CS['Installation TAT in Hours']
X=CS['Service Affiliate TEAMS']
formula = 'y ~ X'
model = ols(formula, CS).fit()
aov_table = anova_lm(model)
print(aov_table)
This test is conducted for one variable Service Affiliate Code as an example here.
# Test of Homogenity of Variance
w, pvalue = stats.levene(CS['Installation TAT in Hours'][CS['Service Affiliate CODE'] == 'A'],
CS['Installation TAT in Hours'][CS['Service Affiliate CODE'] == 'B'],
CS['Installation TAT in Hours'][CS['Service Affiliate CODE'] == 'C'])
print(w, pvalue)
Pre and Post data comparison after improvement to validate if statistical different
Prepost= pd.read_excel('F:/2019 GB Python/ClearCalls Pre-Post Data for Py .xlsx')
Prepost.head()
sns.boxplot(x='variable', y='value', data = pd.melt(Prepost), width =0.3)
from scipy import stats
from scipy.stats import ttest_1samp, ttest_ind,mannwhitneyu,levene,shapiro,wilcoxon
t_statistic, p_value = ttest_ind(Prepost['Pre_TAT'], Prepost['Post_TAT'])
print(t_statistic, p_value)