How to Analyze A/B Test Results
In [1]:
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
# Set the seed so the random sampling below is reproducible
random.seed(42)
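Note that the sampling calls used later (`DataFrame.sample`, `np.random.normal`, `np.random.binomial`) draw from NumPy's random number generator rather than Python's `random` module, so seeding NumPy as well keeps those results reproducible; a minimal addition, assuming that is the intent:
np.random.seed(42)  # seed NumPy's generator so df.sample / np.random.* calls are reproducible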
In [2]:
df = pd.read_csv('ab_data.csv')
df.head()
Out[2]:
In [3]:
df.shape
Out[3]:
In [4]:
df.nunique()
Out[4]:
In [5]:
# Overall conversion rate, in percent
df.converted.mean()*100
Out[5]:
In [6]:
# Rows where group is treatment but the landing_page is not the new page
df.query('landing_page != "new_page" & group == "treatment"').shape
Out[6]:
In [7]:
# Number of rows with missing values
df.isnull().any(axis=1).sum()
Out[7]:
In [8]:
# Remove the mismatched rows, and store the result in a new dataframe df2
df2 = pd.concat([df.query('landing_page == "new_page" & group == "treatment"'),
                 df.query('landing_page == "old_page" & group == "control"')],
                ignore_index=True)
df2.head()
Out[8]:
In [9]:
df2.tail()
Out[9]:
In [10]:
# Double-check that all of the mismatched rows were removed from df2;
# the output of the statement below should be 0
df2[((df2['group'] == 'treatment') == (df2['landing_page'] == 'new_page')) == False].shape[0]
Out[10]:
In [11]:
df2.nunique()
Out[11]:
In [12]:
df2[df2.user_id.duplicated()].user_id
Out[12]:
In [13]:
df2[df2.user_id.duplicated()]
Out[13]:
In [14]:
# Remove one of the rows with a duplicate user_id.
# Hint: dataframe.drop_duplicates() may not work here because the duplicated rows are not entirely identical.
df2.drop(index=df2[df2.user_id.duplicated()].index, inplace=True)
# Check again if the row with a duplicate user_id is deleted or not
df2[df2.user_id.duplicated()]
Out[14]:
In [15]:
# Overall probability that an individual converted, regardless of the page they received
df2.converted.sum() / df2.shape[0]
Out[15]:
In [16]:
# Conversion rate for the control group
df_c = df2.query('group == "control"')
df_c.converted.sum() / df_c.shape[0]
Out[16]:
In [17]:
# Conversion rate for the treatment group
df_t = df2.query('group == "treatment"')
df_t.converted.sum() / df_t.shape[0]
Out[17]:
In [18]:
# Calculate the actual difference (obs_diff) between the conversion rates for the two groups.
obs_diff = df_t.converted.sum() / df_t.shape[0] - df_c.converted.sum() / df_c.shape[0]
obs_diff
Out[18]:
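The two group rates (and their difference) can also be read off in one line with a groupby, which makes a handy cross-check; a small sketch, equivalent to the calculation above:
# Conversion rate per group; treatment minus control should equal obs_diff
rates = df2.groupby('group').converted.mean()
rates['treatment'] - rates['control']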
In [19]:
# Probability that an individual received the new page
df2.query('landing_page == "new_page"').shape[0] / df2.shape[0]
Out[19]:
In [20]:
# p_new under the null: assumed equal to the overall conversion rate
df2.converted.mean()
Out[20]:
In [21]:
# p_old under the null: also equal to the overall conversion rate
df2.converted.mean()
Out[21]:
In [22]:
# n_new: number of individuals shown the new page
df_t.shape[0]
Out[22]:
In [23]:
# n_old: number of individuals shown the old page
df_c.shape[0]
Out[23]:
In [24]:
# Simulate a bootstrap sample for the treatment group
new_page_converted = df_t.sample(df_t.shape[0], replace=True).converted
In [25]:
# Simulate a bootstrap sample for the control group
old_page_converted = df_c.sample(df_c.shape[0], replace=True).converted
In [26]:
# Difference between the simulated conversion rates
new_page_converted.mean() - old_page_converted.mean()
Out[26]:
In [27]:
# Sampling distribution of the difference in conversion rates
p_diffs = []
size = df2.shape[0]
for _ in range(10000):
    samp = df2.sample(size, replace=True)
    new_page_converted = samp.query('group == "treatment"').converted.mean()
    old_page_converted = samp.query('group == "control"').converted.mean()
    p_diffs.append(new_page_converted - old_page_converted)
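As a much faster alternative, a null distribution of the same kind can be simulated directly with binomial draws, assuming both groups convert at the pooled rate under the null; this is a sketch, not the calculation used above:
# Sketch: simulate 10,000 differences under H0 with both groups converting at the pooled rate
p_null = df2.converted.mean()
n_new, n_old = df_t.shape[0], df_c.shape[0]
sim_new = np.random.binomial(n_new, p_null, 10000) / n_new
sim_old = np.random.binomial(n_old, p_null, 10000) / n_old
p_diffs_null = sim_new - sim_old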
In [38]:
# Mean of the simulated differences
np.mean(p_diffs)
Out[38]:
In [33]:
obs_diff
Out[33]:
In [34]:
# Convert to numpy array
p_diffs = np.array(p_diffs)
# Plot sampling distribution
plt.hist(p_diffs);
In [35]:
# create distribution under the null hypothesis
null_vals = np.random.normal(0, p_diffs.std(), p_diffs.size)
plt.hist(null_vals);
In [36]:
# Plot the observed statistic against the null distribution
plt.hist(null_vals);
plt.axvline(obs_diff, c='red')
Out[36]:
In [37]:
# p-value: proportion of the null distribution greater than the observed difference
(null_vals > obs_diff).mean()
Out[37]:
In [39]:
import statsmodels.api as sm
# number of conversions with the old_page
convert_old = df_c.converted.sum()
# number of conversions with the new_page
convert_new = df_t.converted.sum()
# number of individuals who were shown the old_page
n_old = df_c.shape[0]
# number of individuals who received new_page
n_new = df_t.shape[0]
In [40]:
# Two-proportion z-test: is the new page's conversion rate larger than the old page's?
z_score, p_value = sm.stats.proportions_ztest([convert_new, convert_old], [n_new, n_old], alternative='larger')
print(z_score, p_value)
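As a sanity check, the one-sided p-value implied by the z-score can be recovered from the standard normal distribution; a small sketch, assuming scipy is available:
from scipy.stats import norm
# One-sided p-value from the z-score; should agree with the p_value printed above
print(norm.sf(z_score))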
In [41]:
df2['intercept'] = 1
# get_dummies returns the columns in alphabetical order (new_page, old_page),
# so ab_page is 1 when the user saw the new page
df2[['ab_page', 'old_page']] = pd.get_dummies(df2['landing_page'])
df2.head()
Out[41]:
In [42]:
log_mod = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']])
results = log_mod.fit()
In [43]:
results.summary2()
Out[43]:
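The ab_page coefficient is easier to read as an odds ratio; exponentiating the fitted parameters gives it directly, a small sketch using the results object from the cell above:
# Odds ratios for the intercept and ab_page terms
np.exp(results.params)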
In [44]:
# Read the countries.csv
df_3 = pd.read_csv('countries.csv')
df_3.head()
Out[44]:
In [45]:
# Join with the df2 dataframe
df_m = df2.merge(df_3, on ='user_id' , how='inner')
df_m.nunique()
Out[45]:
In [46]:
# Create the necessary dummy variables (get_dummies returns the countries in alphabetical order: CA, UK, US)
df_m[['CA', 'UK', 'US']] = pd.get_dummies(df_m['country'])
df_m.head()
Out[46]:
In [47]:
# Fit the model with country dummies (CA as the baseline), and summarize the results
log_mod = sm.Logit(df_m['converted'], df_m[['intercept', 'ab_page', 'UK', 'US']])
results = log_mod.fit()
results.summary2()
Out[47]:
In [48]:
# Exponentiate the coefficients to get odds ratios, expressed as percentages
np.exp(-0.0149)*100, np.exp(0.0506)*100, np.exp(0.0408)*100
Out[48]:
In [49]:
# Add interaction terms between country and the page the user received
df_m['UK_new'] = df_m['UK'] * df_m['ab_page']
df_m['US_new'] = df_m['US'] * df_m['ab_page']
df_m.head()
Out[49]:
In [50]:
# Fit the model with the interaction terms, and summarize the results
log_mod = sm.Logit(df_m['converted'], df_m[['intercept', 'ab_page', 'UK', 'US', 'UK_new', 'US_new']])
results = log_mod.fit()
results.summary2()
Out[50]:
In [51]:
# Exponentiate the interaction-model coefficients to get odds ratios, expressed as percentages
np.exp(-0.0674)*100, np.exp(0.0118)*100, np.exp(0.0175)*100, np.exp(0.0783)*100, np.exp(0.0469)*100
Out[51]:
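Rather than re-typing the coefficients, the same odds ratios (with confidence intervals) can be pulled straight from the fitted results; a sketch using the results object from the cell above:
# Odds ratios and 95% confidence intervals for every term in the interaction model
np.exp(results.params), np.exp(results.conf_int())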
- Andrew Samy
- Mar 27, 2022