Notice: This page requires JavaScript to function properly.
Please enable JavaScript in your browser settings or update your browser.
Вивчайте Metrics | U-Test
The Art of A/B Testing

book
Metrics

So, we have pairwise compared both datasets' columns. Let's recall Section 1. We need a metric, or better yet, multiple metrics. Good metrics for our datasets would be:

Let's compare the first metric, Conversion Rate, for both datasets. We will plot histograms:

# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the control and test samples (the files are semicolon-separated)
df_control = pd.read_csv(
    'https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_control.csv',
    delimiter=';',
)
df_test = pd.read_csv(
    'https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_test.csv',
    delimiter=';',
)

# Metric: Conversion = Purchase / Click, computed row by row for each group
for frame in (df_control, df_test):
    frame['Conversion'] = frame['Purchase'] / frame['Click']

# One entry per group, control first:
# (values, colour, histogram label, mean-line label)
groups = (
    (df_control['Conversion'], "#1e2635",
     "Conversion of Control Group", 'Mean Control Group'),
    (df_test['Conversion'], "#ff8a00",
     "Conversion of Test Group", 'Mean Test Group'),
)

# Draw both histograms first ...
for values, colour, hist_label, _ in groups:
    sns.histplot(values, color=colour, label=hist_label)

# ... then mark each group's mean with a dashed vertical line
for values, colour, _, mean_label in groups:
    plt.axvline(values.mean(), color=colour, linestyle='dashed',
                linewidth=1, label=mean_label)

# Axis labels, legend and title
plt.xlabel('Conversion')
plt.ylabel('Frequency')
plt.legend()
plt.title('Histogram of Conversion')

# Display the figure
plt.show()
1234567891011121314151617181920212223242526272829
# Import libraries import matplotlib.pyplot as plt import pandas as pd import seaborn as sns # Read .csv files df_control = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_control.csv', delimiter=';') df_test = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_test.csv', delimiter=';') # Define metric df_test['Conversion'] = df_test['Purchase'] / df_test['Click'] df_control['Conversion'] = df_control['Purchase'] / df_control['Click'] # Ploting hists sns.histplot(df_control['Conversion'], color="#1e2635", label="Conversion of Control Group") sns.histplot(df_test['Conversion'], color="#ff8a00", label="Conversion of Test Group") # Add mean line plt.axvline(df_control['Conversion'].mean(), color="#1e2635", linestyle='dashed', linewidth=1, label='Mean Control Group') plt.axvline(df_test['Conversion'].mean(), color="#ff8a00", linestyle='dashed', linewidth=1, label='Mean Test Group') # Sign the axes plt.xlabel('Conversion') plt.ylabel('Frequency') plt.legend() plt.title('Histogram of Conversion') # Show the results plt.show()
copy

Well, the Conversion metric doesn't seem to follow a normal distribution. Let's draw a box plot to take a closer look:

#Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Read the control and test samples (the files are semicolon-separated)
df_control = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_control.csv', delimiter=';')
df_test = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_test.csv', delimiter=';')

# Define the metric: Conversion = Purchase / Click, row by row
df_test['Conversion'] = df_test['Purchase'] / df_test['Click']
df_control['Conversion'] = df_control['Purchase'] / df_control['Click']

# Tag every row with the group it belongs to; these values are used as the
# x-axis categories of the box plot, so they appear as tick labels
df_control['group'] = 'Control group'
df_test['group'] = 'Test group'

# Stack the two frames into one long table. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each frame's
# own row labels.
df_combined = pd.concat([df_control, df_test], ignore_index=True)

# One box per group; the median line is drawn in red
sns.boxplot(
    data=df_combined,
    x='group',
    y='Conversion',
    palette=['#1e2635', '#ff8a00'],
    medianprops={'color': 'red'},
)

# Label the axes (the group names on the ticks make an x label unnecessary)
plt.xlabel('')
plt.ylabel('Conversion')
plt.title('Comparison of Conversion')

# Show the results
plt.show()
1234567891011121314151617181920212223242526272829
#Import libraries import matplotlib.pyplot as plt import pandas as pd import seaborn as sns #Read .csv files df_control = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_control.csv', delimiter=';') df_test = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_test.csv', delimiter=';') #Define metric df_test['Conversion'] = df_test['Purchase'] / df_test['Click'] df_control['Conversion'] = df_control['Purchase'] / df_control['Click'] #We add to the dataframes columns-labels, which mean belonging to either the control or the test group df_control['group'] = 'Contol group' df_test['group'] = 'Test group' #Concat the dataframes and plotting boxplots df_combined = pd.concat([df_control, df_test]) sns.boxplot(data=df_combined, x='group', y='Conversion', palette=['#1e2635', '#ff8a00'], medianprops={'color': 'red'}) #Sign the axis plt.xlabel('') plt.ylabel('Conversion') plt.title('Comparison of Conversion') #Show the results plt.show()
copy

The distributions are heavily skewed, suggesting they are unlikely to be normal. Let's confirm this by performing the Shapiro-Wilk test:

# Import libraries
import pandas as pd
from scipy.stats import shapiro

# Read the control and test samples (the files are semicolon-separated)
df_control = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_control.csv', delimiter=';')
df_test = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_test.csv', delimiter=';')

# Define the metric: Conversion = Purchase / Click, row by row
df_test['Conversion'] = df_test['Purchase'] / df_test['Click']
df_control['Conversion'] = df_control['Purchase'] / df_control['Click']

# Testing normality.
# A p-value above 0.05 means normality is not rejected for that group;
# otherwise the group is reported as not normally distributed.

# Control group
stat_control, p_control = shapiro(df_control['Conversion'])
print("Control group: ")
print("Stat: %.4f, p-value: %.4f" % (stat_control, p_control))
if p_control > 0.05:
    print('Control group is likely to normal distribution')
else:
    print('Control group is NOT likely to normal distribution')

# Test group: results are kept in their own variables and the verdict names
# the test group, so the two reports cannot be confused
stat_test, p_test = shapiro(df_test['Conversion'])
print("Test group: ")
print("Stat: %.4f, p-value: %.4f" % (stat_test, p_test))
if p_test > 0.05:
    print('Test group is likely to normal distribution')
else:
    print('Test group is NOT likely to normal distribution')
12345678910111213141516171819202122232425262728
# Import libraries import pandas as pd from scipy.stats import shapiro # Read .csv files df_control = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_control.csv', delimiter=';') df_test = pd.read_csv('https://codefinity-content-media.s3.eu-west-1.amazonaws.com/c3b98ad3-420d-403f-908d-6ab8facc3e28/ab_test.csv', delimiter=';') # Define metric df_test['Conversion'] = df_test['Purchase'] / df_test['Click'] df_control['Conversion'] = df_control['Purchase'] / df_control['Click'] # Testing normality stat_control, p_control = shapiro(df_control['Conversion']) print("Control group: ") print("Stat: %.4f, p-value: %.4f" % (stat_control, p_control)) if p_control > 0.05: print('Control group is likely to normal distribution') else: print('Control group is NOT likely to normal distribution') stat_control, p_control = shapiro(df_test['Conversion']) print("Test group: ") print("Stat: %.4f, p-value: %.4f" % (stat_control, p_control)) if p_control > 0.05: print('Control group is likely to normal distribution') else: print('Control group is NOT likely to normal distribution')
copy

The Shapiro-Wilk test indicates that the Conversion metric is not normally distributed in either group. However, this does not hinder us: in such a situation we can turn to the non-parametric Mann-Whitney U-test, also known as the U-test, which does not assume normality.

Все було зрозуміло?

Як ми можемо покращити це?

Дякуємо за ваш відгук!

Секція 5. Розділ 1
some-alt