Some pandas sample codes

Pandas group-by sample (vendors with more than one category)

(deals.groupby(['VendorID']).agg({'Category':'nunique'})>1)['Category'].value_counts()
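
A runnable sketch of what the one-liner above reports, using a small made-up deals frame (only VendorID and Category are assumed): True counts vendors that sell in more than one category, False counts single-category vendors.

import pandas as pd

# Hypothetical toy data, for illustration only.
deals = pd.DataFrame({
    'VendorID': [1, 1, 2, 3, 3, 3],
    'Category': ['Food', 'Leisure & Sports', 'Food', 'Food', 'Food', 'Beauty'],
})

# Distinct categories per vendor, compared against 1, then tallied.
print((deals.groupby(['VendorID']).agg({'Category': 'nunique'}) > 1)['Category'].value_counts())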

Group by with named aggregations

df.groupby('Column0').agg(Name1=('Column1', 'count'), Name2=('Column2', 'nunique'))

Pivot table

import numpy as np

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], aggfunc=np.sum)
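
A self-contained sketch of the same call on a made-up frame (the A/B/C/D column names simply mirror the line above): rows are indexed by (A, B), there is one column per value of C, and each cell is the sum of D.

import numpy as np
import pandas as pd

# Hypothetical toy data matching the column names used above.
df = pd.DataFrame({
    'A': ['x', 'x', 'y', 'y'],
    'B': ['one', 'one', 'two', 'two'],
    'C': ['small', 'large', 'small', 'large'],
    'D': [1, 2, 3, 4],
})

print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], aggfunc=np.sum))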

Put each vendor's top category and first order details in one row.

# Count distinct deals per (vendor, category), sort descending and keep each
# vendor's most frequent category.
vcat = deals.groupby(['VendorID','Category']).agg({'DealID':'nunique'}).sort_values('DealID', ascending=False).reset_index()
vcat = vcat.groupby('VendorID').agg({'Category':'first'})
vcat

# One row per vendor: distinct deal count plus the first value of each vendor-level metric.
vds = deals.groupby('VendorID').agg(dls=('DealID', 'nunique'),
                                    NumberOfVendorRating=('NumberOfVendorRating', 'first'),
                                    AverageVendorRating=('AverageVendorRating', 'first'),
                                    UniquePageView=('UniquePageView', 'first'),
                                    AddToCart=('AddToCart', 'first')
                                    ).reset_index()
vds

Python seaborn heatmap

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15, 8))
sns.heatmap(vendors.corr(), annot=True, cmap="Reds", vmin=0.7)

plt.figure(figsize=(15, 8))
sns.heatmap(vendors.merge(vccat[vccat['Category'] == 'Leisure & Sports'], on='VendorID').corr(), annot=True, cmap="Purples", vmin=0.65)

Orders, Next/First order month

Orders = df[['OrderID','VendorID','ShamsiYearMonth','CustomerID','NthCustomerOrder']].dropna().reset_index(drop=True)
Orders.drop_duplicates(keep='first', inplace=True, ignore_index=True)

#Next Order Month
Orders.sort_values(['CustomerID','ShamsiYearMonth'] , inplace=True , ascending=False)
Orders['nextOrder'] = Orders.groupby('CustomerID')['ShamsiYearMonth'].diff()*-1
Orders.loc[(Orders.nextOrder == -0.00),'nextOrder'] = 0
Orders['nextOrder'].value_counts()

#First Order Month
Orders.set_index(keys='CustomerID', inplace=True)
# Orders['FirstOrderMonth'] = Orders.groupby('CustomerID')['ShamsiYearMonth'].min()
Orders['FirstOrderMonth'] = Orders[(Orders['NthCustomerOrder'] ==1)]['ShamsiYearMonth']
Orders.reset_index(inplace=True)
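
A minimal sketch of the two tricks above on made-up data (ShamsiYearMonth is treated as a plain integer month code here, so the gaps are only illustrative): sorting descending and negating the per-customer diff gives the gap to the next order, and assigning the NthCustomerOrder == 1 rows while CustomerID is the index broadcasts each customer's first order month to all of that customer's rows.

import pandas as pd

# Hypothetical toy orders.
Orders = pd.DataFrame({
    'CustomerID':       [1, 1, 1, 2],
    'ShamsiYearMonth':  [139901, 139903, 139904, 139902],
    'NthCustomerOrder': [1, 2, 3, 1],
})

Orders.sort_values(['CustomerID', 'ShamsiYearMonth'], inplace=True, ascending=False)
Orders['nextOrder'] = Orders.groupby('CustomerID')['ShamsiYearMonth'].diff() * -1

Orders.set_index(keys='CustomerID', inplace=True)
Orders['FirstOrderMonth'] = Orders[Orders['NthCustomerOrder'] == 1]['ShamsiYearMonth']
Orders.reset_index(inplace=True)
print(Orders)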

Nth customer order

v2['Order'] = v2.groupby(['NID']).Month.transform(lambda x: pd.factorize(x)[0] + 1)
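
A toy illustration of the factorize trick (NID is assumed to be a customer id and Month an order month): each distinct Month gets a 1-based order number per customer, in order of appearance, so the frame should already be sorted by NID and Month.

import pandas as pd

# Hypothetical toy data.
v2 = pd.DataFrame({
    'NID':   [1, 1, 1, 2, 2],
    'Month': [139901, 139902, 139902, 139901, 139903],
})

# Two orders in the same month share the same order number.
v2['Order'] = v2.groupby(['NID']).Month.transform(lambda x: pd.factorize(x)[0] + 1)
print(v2)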

Rename Columns

df.rename(columns={'W': 'A', 'X': 'B'})  # mapping of old name -> new name

Pie Chart

import matplotlib.pyplot as plt

fig1, ax1 = plt.subplots()
labels = v2.index
ax1.pie(v[["cnt","Pdate"]].pivot_table(index="cnt" , aggfunc="count" ), autopct='%1.1f%%',labels=labels,
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

Retention

g = pd.DataFrame(df['Frequency'].value_counts())
g

h = pd.DataFrame(columns = ['F', 'Retention Rate'])
for i in range(291):
    h.loc[i] = [i + 1, (g['Frequency'].iloc[i+1:].sum()/(g['Frequency'].iloc[i:].sum()))]


h.plot(x = 'F', y = 'Retention Rate', figsize=[16,8], legend=False);
plt.xlim(0,60)
# plt.xticks([1,2,3,4,5,6,7,8,9,10,15,20,30,40,50,60]);
# plt.yticks([0,0.20,0.40,0.45,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95,1.0]);
plt.xlabel('Order Number')
plt.ylabel('Retention Rate')


h.plot(x = 'F', y = 'Retention Rate', figsize=[16,8], legend=False);
plt.xlim(0,60)
plt.xticks([1,2,3,4,5,6,7,8,9,10,20,30,40,50,60]);
plt.yticks([0,0.20,0.40,0.50,0.56,0.60,0.70,0.80,0.90,1.0]);
plt.xlabel('Order Number')
plt.ylabel('Retention Rate')
plt.axvline(x = 2, linewidth = 2, color = 'grey')
plt.axhline(y = 0.56, linewidth = 2, color = 'grey')
plt.axvline(x = 5, linewidth = 2, color = 'cyan')
plt.axhline(y = 0.71, linewidth = 2, color = 'cyan')
plt.axvline(x = 9, linewidth = 2, color = 'green')
plt.axhline(y = 0.8, linewidth = 2, color = 'green')

Group by with custom lambda functions

import datetime as dt

PRESENT = dt.datetime(2020, 2, 5)
f_recency = lambda date: (PRESENT - date.max()).days                          # days since the last order
f_length = lambda date: (date.max() - date.min()).days                        # days between first and last order
f_unique = lambda value: value.nunique()                                      # number of distinct values
f_second_date = lambda date: (date.nsmallest(2).iloc[-1] - date.min()).days   # days from first to second order
rfm = filtered_orders.groupby('customer_id').agg({'price': [('Monetary','sum'), ('Monetary_Mean', 'mean'), ('Monetary Median', 'median')], 
                                                   'date': [('Recency', f_recency), ('Length', f_length), ('Second_Order_Time', f_second_date)], 
                                                   'order_id': [('Frequency', 'count')]
                                                     })
rfm.columns = rfm.columns.get_level_values(1)
rfm.reset_index(inplace=True)
rfm['Second_Order_Time'] = (-1)*rfm['Second_Order_Time']

Save To Excel


from datetime import datetime

stri = 'The related Name' + datetime.now().strftime("-%Y%m%d-%H%M%S") + '.xlsx'

# The context manager writes and closes the workbook (ExcelWriter.save() is gone in recent pandas).
with pd.ExcelWriter(stri) as writer:
    pstg.to_excel(writer, sheet_name='Percentage')

Categorize the Range

df['PriceBin'] = pd.cut(df['PriceAvg'], bins=3)
df['PriceBin'].value_counts()

(54060.0, 2040000.0]      209 
(2040000.0, 4020000.0]      4 
(4020000.0, 6000000.0]      1 
Name: PriceBin, dtype: int64


df['PriceBin'] = pd.qcut(df['PriceAvg'], q=3)
df['PriceBin'].value_counts().sort_index()

(59999.999, 210000.0]    77
(210000.0, 315000.0]     66
(315000.0, 6000000.0]    71
Name: PriceBin, dtype: int64
h = df.groupby('PriceBin', as_index=False).median()['SalesAvg']

h = pd.DataFrame(h)
h.reset_index(inplace=True)
h


   PriceBin                 SalesAvg
0  (59999.999, 210000.0]     42.000000
1  (210000.0, 315000.0]     145.166667
2  (315000.0, 6000000.0]    114.200000

Total row

# Wide sum of Value (rows: Level, columns: Company x Item). A grand 'total' column
# is added across all (Company, Item) columns first; after stacking Company back into
# the rows, a per-Company 'total' across Items is added; finally Item is stacked too.
m = df.groupby(['Level','Company','Item'])['Value'].sum().unstack(level=['Company','Item'])
m = m.assign(total=m.sum(1))
m = m.stack(level='Company')
m = m.assign(total=m.sum(1))
m = m.stack(level='Item')
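
A toy illustration of the pattern above (Level/Company/Item/Value are just the column names the snippet assumes): the result is a Series keyed by Level, Company and Item in which the 'total' entries hold the grand total per Level and the per-Company totals.

import pandas as pd

# Hypothetical toy data.
df = pd.DataFrame({
    'Level':   ['L1', 'L1', 'L1', 'L2'],
    'Company': ['A', 'A', 'B', 'A'],
    'Item':    ['x', 'y', 'x', 'x'],
    'Value':   [1, 2, 3, 4],
})

m = df.groupby(['Level','Company','Item'])['Value'].sum().unstack(level=['Company','Item'])
m = m.assign(total=m.sum(1))
m = m.stack(level='Company')
m = m.assign(total=m.sum(1))
m = m.stack(level='Item')
print(m)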

IF condition

#df.loc[df['column name'] condition, 'new column name'] = 'value if condition is met'

df.loc[df['set_of_numbers'] <= 4, 'equal_or_lower_than_4?'] = 'True' 
df.loc[df['set_of_numbers'] > 4, 'equal_or_lower_than_4?'] = 'False' 
#df['new column name'] = df['column name'].apply(lambda x: 'value if condition is met' if x condition else 'value if condition is not met')

df['equal_or_lower_than_4?'] = df['set_of_numbers'].apply(lambda x: 'True' if x <= 4 else 'False')
#IF condition – strings
df.loc[df['First_name'] == 'Bill', 'name_match'] = 'Match' 

Frequency retention rate (per column)

h = ret.copy()
for j in range(len(h.columns)):
    for i in range(len(h)):
        h.iloc[i, j] = ret.iloc[i+1:, j].sum() / ret.iloc[i:, j].sum()

h.reset_index(inplace = True)
h

Multiline Plot

plt.rc('figure', figsize=(16, 8))

plt.plot( 'cnt', 'High', data=h ,marker='', color='green', linewidth=2 )
plt.plot( 'cnt', 'Medium', data=h, marker='', color='olive', linewidth=2)
plt.plot( 'cnt', 'Low', data=h, marker='', color='red', linewidth=2, linestyle='dashed')
plt.legend()

Customer Lifetime Value with probability of returning

h = pd.DataFrame(columns = ['F', 'cnt'])
for i in range(10):
    h.loc[i] = [i + 1, (ret['DistCount'].iloc[i+1:].sum()/(ret['DistCount'].iloc[i:].sum()))]
    
h
clv = pd.DataFrame(columns = ['F','clv'])
for i in range(10):
    clv.loc[i+1] = [i+1,v2[v2['Order'] == i+1]['paym'].sum()/v2[v2['Order'] == i+1]['NID'].nunique()]

clv = clv.merge(h,on = 'F')
cl = 0
# First-order CLV is counted unconditionally; each later order's CLV is weighted
# by the retention rate leading into it.
for i in range(1, 10):
    cl = cl + clv.iloc[i, 1] * clv.iloc[i-1, 2]
cl = cl + clv.iloc[0, 1]
cl
(Sample output: the merged clv frame, one row per order number F = 1..10, with columns F, clv and Ret.)

Import all csv files from a folder

import pandas as pd
import os

path = os.getcwd()
files = os.listdir(path)

files_csv = [f for f in files if f.endswith('.csv')]

# DataFrame.append was removed in pandas 2.0; read everything and concat once instead.
df = pd.concat([pd.read_csv(f) for f in files_csv], ignore_index=True)

loc / iloc

df.iloc[:, 1:]  # all rows, every column except the first (selection by position)
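
The heading also mentions .loc, which selects by label rather than by position; a minimal sketch on a made-up frame:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}, index=['r1', 'r2'])

df.iloc[:, 1:]            # positional: all rows, columns 1 onward ('b' and 'c')
df.loc['r1', ['a', 'c']]  # label-based: row 'r1', columns 'a' and 'c'
df.loc[df['a'] > 1, 'b']  # boolean mask on rows, single column by label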