Some pandas sample codes

Pandas group-by sample (vendors with more than one category)

(deals.groupby(['VendorID']).agg({'Category':'nunique'})>1)['Category'].value_counts()
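
A runnable sketch of what the one-liner above reports, using a small made-up deals frame (only VendorID and Category are assumed): True counts vendors that sell in more than one category, False counts single-category vendors.

import pandas as pd

# Hypothetical toy data, for illustration only.
deals = pd.DataFrame({
    'VendorID': [1, 1, 2, 3, 3, 3],
    'Category': ['Food', 'Leisure & Sports', 'Food', 'Food', 'Food', 'Beauty'],
})

# Distinct categories per vendor, compared against 1, then tallied.
print((deals.groupby(['VendorID']).agg({'Category': 'nunique'}) > 1)['Category'].value_counts())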

Group by with named aggregations

df.groupby('Column0').agg(Name1=('Column1', 'count'), Name2=('Column2', 'nunique'))

Pivot table

import numpy as np

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], aggfunc=np.sum)
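
A self-contained sketch of the same call on a made-up frame (the A/B/C/D column names simply mirror the line above): rows are indexed by (A, B), there is one column per value of C, and each cell is the sum of D.

import numpy as np
import pandas as pd

# Hypothetical toy data matching the column names used above.
df = pd.DataFrame({
    'A': ['x', 'x', 'y', 'y'],
    'B': ['one', 'one', 'two', 'two'],
    'C': ['small', 'large', 'small', 'large'],
    'D': [1, 2, 3, 4],
})

print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], aggfunc=np.sum))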

Put each vendor's top category and first order details in one row.

# Count distinct deals per (vendor, category), sort descending and keep each
# vendor's most frequent category.
vcat = deals.groupby(['VendorID','Category']).agg({'DealID':'nunique'}).sort_values('DealID', ascending=False).reset_index()
vcat = vcat.groupby('VendorID').agg({'Category':'first'})
vcat

# One row per vendor: distinct deal count plus the first value of each vendor-level metric.
vds = deals.groupby('VendorID').agg(dls=('DealID', 'nunique'),
                                    NumberOfVendorRating=('NumberOfVendorRating', 'first'),
                                    AverageVendorRating=('AverageVendorRating', 'first'),
                                    UniquePageView=('UniquePageView', 'first'),
                                    AddToCart=('AddToCart', 'first')
                                    ).reset_index()
vds

Python seaborn heatmap

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15, 8))
sns.heatmap(vendors.corr(), annot=True, cmap="Reds", vmin=0.7)

plt.figure(figsize=(15, 8))
sns.heatmap(vendors.merge(vccat[vccat['Category'] == 'Leisure & Sports'], on='VendorID').corr(), annot=True, cmap="Purples", vmin=0.65)

Orders, Next/First order month

Orders = df[['OrderID','VendorID','ShamsiYearMonth','CustomerID','NthCustomerOrder']].dropna().reset_index(drop=True)
Orders.drop_duplicates(keep='first', inplace=True, ignore_index=True)

#Next Order Month
Orders.sort_values(['CustomerID','ShamsiYearMonth'] , inplace=True , ascending=False)
Orders['nextOrder'] = Orders.groupby('CustomerID')['ShamsiYearMonth'].diff()*-1
Orders.loc[(Orders.nextOrder == -0.00),'nextOrder'] = 0
Orders['nextOrder'].value_counts()

#First Order Month
Orders.set_index(keys='CustomerID', inplace=True)
# Orders['FirstOrderMonth'] = Orders.groupby('CustomerID')['ShamsiYearMonth'].min()
Orders['FirstOrderMonth'] = Orders[(Orders['NthCustomerOrder'] ==1)]['ShamsiYearMonth']
Orders.reset_index(inplace=True)
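
A minimal sketch of the two tricks above on made-up data (ShamsiYearMonth is treated as a plain integer month code here, so the gaps are only illustrative): sorting descending and negating the per-customer diff gives the gap to the next order, and assigning the NthCustomerOrder == 1 rows while CustomerID is the index broadcasts each customer's first order month to all of that customer's rows.

import pandas as pd

# Hypothetical toy orders.
Orders = pd.DataFrame({
    'CustomerID':       [1, 1, 1, 2],
    'ShamsiYearMonth':  [139901, 139903, 139904, 139902],
    'NthCustomerOrder': [1, 2, 3, 1],
})

Orders.sort_values(['CustomerID', 'ShamsiYearMonth'], inplace=True, ascending=False)
Orders['nextOrder'] = Orders.groupby('CustomerID')['ShamsiYearMonth'].diff() * -1

Orders.set_index(keys='CustomerID', inplace=True)
Orders['FirstOrderMonth'] = Orders[Orders['NthCustomerOrder'] == 1]['ShamsiYearMonth']
Orders.reset_index(inplace=True)
print(Orders)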

Nth customer order

v2['Order'] = v2.groupby(['NID']).Month.transform(lambda x: pd.factorize(x)[0] + 1)
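
A toy illustration of the factorize trick (NID is assumed to be a customer id and Month an order month): each distinct Month gets a 1-based order number per customer, in order of appearance, so the frame should already be sorted by NID and Month.

import pandas as pd

# Hypothetical toy data.
v2 = pd.DataFrame({
    'NID':   [1, 1, 1, 2, 2],
    'Month': [139901, 139902, 139902, 139901, 139903],
})

# Two orders in the same month share the same order number.
v2['Order'] = v2.groupby(['NID']).Month.transform(lambda x: pd.factorize(x)[0] + 1)
print(v2)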

Rename Columns

df.rename(columns={'W': 'A', 'X': 'B'})  # mapping of old name -> new name

Pie Chart

import matplotlib.pyplot as plt

fig1, ax1 = plt.subplots()
labels = v2.index
ax1.pie(v[["cnt","Pdate"]].pivot_table(index="cnt" , aggfunc="count" ), autopct='%1.1f%%',labels=labels,
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

Retention

g = pd.DataFrame(df['Frequency'].value_counts())
g

h = pd.DataFrame(columns = ['F', 'Retention Rate'])
for i in range(291):
    h.loc[i] = [i + 1, (g['Frequency'].iloc[i+1:].sum()/(g['Frequency'].iloc[i:].sum()))]


h.plot(x = 'F', y = 'Retention Rate', figsize=[16,8], legend=False);
plt.xlim(0,60)
# plt.xticks([1,2,3,4,5,6,7,8,9,10,15,20,30,40,50,60]);
# plt.yticks([0,0.20,0.40,0.45,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95,1.0]);
plt.xlabel('Order Number')
plt.ylabel('Retention Rate')


h.plot(x = 'F', y = 'Retention Rate', figsize=[16,8], legend=False);
plt.xlim(0,60)
plt.xticks([1,2,3,4,5,6,7,8,9,10,20,30,40,50,60]);
plt.yticks([0,0.20,0.40,0.50,0.56,0.60,0.70,0.80,0.90,1.0]);
plt.xlabel('Order Number')
plt.ylabel('Retention Rate')
plt.axvline(x = 2, linewidth = 2, color = 'grey')
plt.axhline(y = 0.56, linewidth = 2, color = 'grey')
plt.axvline(x = 5, linewidth = 2, color = 'cyan')
plt.axhline(y = 0.71, linewidth = 2, color = 'cyan')
plt.axvline(x = 9, linewidth = 2, color = 'green')
plt.axhline(y = 0.8, linewidth = 2, color = 'green')

Group by with custom lambda functions

import datetime as dt

PRESENT = dt.datetime(2020, 2, 5)
f_recency = lambda date: (PRESENT - date.max()).days                          # days since the last order
f_length = lambda date: (date.max() - date.min()).days                        # days between first and last order
f_unique = lambda value: value.nunique()                                      # number of distinct values
f_second_date = lambda date: (date.nsmallest(2).iloc[-1] - date.min()).days   # days from first to second order
rfm = filtered_orders.groupby('customer_id').agg({'price': [('Monetary','sum'), ('Monetary_Mean', 'mean'), ('Monetary Median', 'median')], 
                                                   'date': [('Recency', f_recency), ('Length', f_length), ('Second_Order_Time', f_second_date)], 
                                                   'order_id': [('Frequency', 'count')]
                                                     })
rfm.columns = rfm.columns.get_level_values(1)
rfm.reset_index(inplace=True)
rfm['Second_Order_Time'] = (-1)*rfm['Second_Order_Time']

Save To Excel


from datetime import datetime

stri = 'The related Name' + datetime.now().strftime("-%Y%m%d-%H%M%S") + '.xlsx'

# The context manager writes and closes the workbook (ExcelWriter.save() is gone in recent pandas).
with pd.ExcelWriter(stri) as writer:
    pstg.to_excel(writer, sheet_name='Percentage')

Categorize the Range

df['PriceBin'] = pd.cut(df['PriceAvg'], bins=3)
df['PriceBin'].value_counts()

(54060.0, 2040000.0]      209 
(2040000.0, 4020000.0]      4 
(4020000.0, 6000000.0]      1 
Name: PriceBin, dtype: int64


df['PriceBin'] = pd.qcut(df['PriceAvg'], q=3)
df['PriceBin'].value_counts().sort_index()

(59999.999, 210000.0]    77
(210000.0, 315000.0]     66
(315000.0, 6000000.0]    71
Name: PriceBin, dtype: int64
h = df.groupby('PriceBin', as_index=False).median()['SalesAvg']

h = pd.DataFrame(h)
h.reset_index(inplace=True)
h


   PriceBin                 SalesAvg
0  (59999.999, 210000.0]     42.000000
1  (210000.0, 315000.0]     145.166667
2  (315000.0, 6000000.0]    114.200000

Total row

# Wide sum of Value (rows: Level, columns: Company x Item). A grand 'total' column
# is added across all (Company, Item) columns first; after stacking Company back into
# the rows, a per-Company 'total' across Items is added; finally Item is stacked too.
m = df.groupby(['Level','Company','Item'])['Value'].sum().unstack(level=['Company','Item'])
m = m.assign(total=m.sum(1))
m = m.stack(level='Company')
m = m.assign(total=m.sum(1))
m = m.stack(level='Item')
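
A toy illustration of the pattern above (Level/Company/Item/Value are just the column names the snippet assumes): the result is a Series keyed by Level, Company and Item in which the 'total' entries hold the grand total per Level and the per-Company totals.

import pandas as pd

# Hypothetical toy data.
df = pd.DataFrame({
    'Level':   ['L1', 'L1', 'L1', 'L2'],
    'Company': ['A', 'A', 'B', 'A'],
    'Item':    ['x', 'y', 'x', 'x'],
    'Value':   [1, 2, 3, 4],
})

m = df.groupby(['Level','Company','Item'])['Value'].sum().unstack(level=['Company','Item'])
m = m.assign(total=m.sum(1))
m = m.stack(level='Company')
m = m.assign(total=m.sum(1))
m = m.stack(level='Item')
print(m)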

IF condition

#df.loc[df['column name'] condition, 'new column name'] = 'value if condition is met'

df.loc[df['set_of_numbers'] <= 4, 'equal_or_lower_than_4?'] = 'True' 
df.loc[df['set_of_numbers'] > 4, 'equal_or_lower_than_4?'] = 'False' 
#df['new column name'] = df['column name'].apply(lambda x: 'value if condition is met' if x condition else 'value if condition is not met')

df['equal_or_lower_than_4?'] = df['set_of_numbers'].apply(lambda x: 'True' if x <= 4 else 'False')
#IF condition – strings
df.loc[df['First_name'] == 'Bill', 'name_match'] = 'Match' 

Frequency retention rate (per column)

h = ret.copy()
for j in range(len(h.columns)):
    for i in range(len(h)):
        h.iloc[i, j] = ret.iloc[i+1:, j].sum() / ret.iloc[i:, j].sum()

h.reset_index(inplace = True)
h

Multiline Plot

plt.rc('figure', figsize=(16, 8))

plt.plot( 'cnt', 'High', data=h ,marker='', color='green', linewidth=2 )
plt.plot( 'cnt', 'Medium', data=h, marker='', color='olive', linewidth=2)
plt.plot( 'cnt', 'Low', data=h, marker='', color='red', linewidth=2, linestyle='dashed')
plt.legend()

Customer Lifetime Value with probability of returning

h = pd.DataFrame(columns = ['F', 'cnt'])
for i in range(10):
    h.loc[i] = [i + 1, (ret['DistCount'].iloc[i+1:].sum()/(ret['DistCount'].iloc[i:].sum()))]
    
h
clv = pd.DataFrame(columns = ['F','clv'])
for i in range(10):
    clv.loc[i+1] = [i+1,v2[v2['Order'] == i+1]['paym'].sum()/v2[v2['Order'] == i+1]['NID'].nunique()]

clv = clv.merge(h,on = 'F')
cl = 0
# First-order CLV is counted unconditionally; each later order's CLV is weighted
# by the retention rate leading into it.
for i in range(1, 10):
    cl = cl + clv.iloc[i, 1] * clv.iloc[i-1, 2]
cl = cl + clv.iloc[0, 1]
cl
(Sample output: the merged clv frame, one row per order number F = 1..10, with columns F, clv and Ret.)

Import all csv files from a folder

import pandas as pd
import os

path = os.getcwd()
files = os.listdir(path)

files_csv = [f for f in files if f.endswith('.csv')]

# DataFrame.append was removed in pandas 2.0; read everything and concat once instead.
df = pd.concat([pd.read_csv(f) for f in files_csv], ignore_index=True)

loc / iloc

df.iloc[:, 1:]  # all rows, every column except the first (selection by position)
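
The heading also mentions .loc, which selects by label rather than by position; a minimal sketch on a made-up frame:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}, index=['r1', 'r2'])

df.iloc[:, 1:]            # positional: all rows, columns 1 onward ('b' and 'c')
df.loc['r1', ['a', 'c']]  # label-based: row 'r1', columns 'a' and 'c'
df.loc[df['a'] > 1, 'b']  # boolean mask on rows, single column by label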