If you knew SQL before and want to migrate to Python, you can use this article.
TiTle | SQL | Pandas | Desc |
Simple | SELECT total_bill, tip, smoker, time FROM tips LIMIT ۵; |
tips[[‘total_bill’, ‘tip’, ‘smoker’, ‘time’]].head(۵) | |
Where | SELECT * FROM tips WHERE time = ‘Dinner’ LIMIT ۵; |
tips[tips[‘time’] == ‘Dinner’].head(۵) | |
Multiple conditions | SELECT * FROM tips WHERE time = ‘Dinner’ AND tip > ۵.۰۰; |
tips[(tips[‘time’] == ‘Dinner’) & (tips[‘tip’] > ۵.۰۰)] | |
SELECT * FROM frame WHERE col2 IS NULL; |
frame[frame[‘col2’].isna()] | ||
Group By | SELECT sex, count(*) FROM tips GROUP BY sex; |
tips.groupby(‘sex’).size() | Notice that in the pandas code we used size() and not count(). This is because count() applies the function to each column, returning the number of not null records within each. |
SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day; |
tips.groupby(‘day’).agg({‘tip’: np.mean, ‘day’: np.size}) | ||
SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day; |
tips.groupby([‘smoker’, ‘day’]).agg({‘tip’: [np.size, np.mean]}) | ||
Unique list | SELECT DISTINCT finalized from tbl | Df.finalized.unique() | |
INNER JOIN | SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key; |
pd.merge(df1, df2, on=’key’) | also offers parameters for cases when you’d like to join one DataFrame’s column with another DataFrame’s index. |
pd.merge(df1, indexed_df2, left_on=’key’, right_index=True) | |||
LEFT OUTER JOIN | //show all records from df1 SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key; |
pd.merge(df1, df2, on=’key’, how=’left’) | |
RIGHT JOIN | // show all records from df2 SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key; |
pd.merge(df1, df2, on=’key’, how=’right’) | |
FULL JOIN | //show all records from both tables SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key; |
pd.merge(df1, df2, on=’key’, how=’outer’) | |
UNION | SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2; |
df1 = pd.DataFrame({‘city’: [‘Chicago’, ‘San Francisco’, ‘New York City’], ‘rank’: range(1, 4)}) /// df2 = pd.DataFrame({‘city’: [‘Chicago’, ‘Boston’, ‘Los Angeles’],’rank’: [1, 4, 5]}) ///pd.concat([df1, df2]) | |
pd.concat([df1, df2]).drop_duplicates() | |||
Top N rows with offset | SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5; |
tips.nlargest(10 + 5, columns=’tip’).tail(10) | The OFFSET clause skips the offset rows before beginning to return the rows |
Top N rows per group | SELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn FROM tips t ) WHERE rn < 3 ORDER BY day, rn; |
(tips.assign(rn=tips.sort_values([‘total_bill’], ascending=False) .groupby([‘day’]) .cumcount() + 1) .query(‘rn < 3’) .sort_values([‘day’, ‘rn’])) | the same using rank(method=’first’) function |
(tips.assign(rnk=tips.groupby([‘day’])[‘total_bill’].rank(method=’first’, ascending=False)).query(‘rnk < 3’).sort_values([‘day’, ‘rnk’]))
|
|||
UPDATE
|
UPDATE tips
SET tip = tip*2 WHERE tip < 2;
|
tips.loc[tips[‘tip’] < 2, ‘tip’] *= 2
|
|
DELETE
|
DELETE FROM tips
WHERE tip > 9;
|
tips = tips.loc[tips[‘tip’] <= 9]
|