Pandas V.S SQL

Categories

If you knew SQL before and want to migrate to Python, you can use this article.

TiTle	SQL	Pandas	Desc
Simple	SELECT total_bill, tip, smoker, time FROM tips LIMIT ۵;	tips[[‘total_bill’, ‘tip’, ‘smoker’, ‘time’]].head(۵)
Where	SELECT * FROM tips WHERE time = ‘Dinner’ LIMIT ۵;	tips[tips[‘time’] == ‘Dinner’].head(۵)
Multiple conditions	SELECT * FROM tips WHERE time = ‘Dinner’ AND tip > ۵.۰۰;	tips[(tips[‘time’] == ‘Dinner’) & (tips[‘tip’] > ۵.۰۰)]
	SELECT * FROM frame WHERE col2 IS NULL;	frame[frame[‘col2’].isna()]
Group By	SELECT sex, count(*) FROM tips GROUP BY sex;	tips.groupby(‘sex’).size()	Notice that in the pandas code we used size() and not count(). This is because count() applies the function to each column, returning the number of not null records within each.
	SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;	tips.groupby(‘day’).agg({‘tip’: np.mean, ‘day’: np.size})
	SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;	tips.groupby([‘smoker’, ‘day’]).agg({‘tip’: [np.size, np.mean]})
Unique list	SELECT DISTINCT finalized from tbl	Df.finalized.unique()
INNER JOIN	SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;	pd.merge(df1, df2, on=’key’)	also offers parameters for cases when you’d like to join one DataFrame’s column with another DataFrame’s index.
		pd.merge(df1, indexed_df2, left_on=’key’, right_index=True)
LEFT OUTER JOIN	//show all records from df1 SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key;	pd.merge(df1, df2, on=’key’, how=’left’)
RIGHT JOIN	// show all records from df2 SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key;	pd.merge(df1, df2, on=’key’, how=’right’)
FULL JOIN	//show all records from both tables SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key;	pd.merge(df1, df2, on=’key’, how=’outer’)
UNION	SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2;	df1 = pd.DataFrame({‘city’: [‘Chicago’, ‘San Francisco’, ‘New York City’], ‘rank’: range(1, 4)}) /// df2 = pd.DataFrame({‘city’: [‘Chicago’, ‘Boston’, ‘Los Angeles’],’rank’: [1, 4, 5]}) ///pd.concat([df1, df2])
		pd.concat([df1, df2]).drop_duplicates()
Top N rows with offset	SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;	tips.nlargest(10 + 5, columns=’tip’).tail(10)	The OFFSET clause skips the offset rows before beginning to return the rows
Top N rows per group	SELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn FROM tips t ) WHERE rn < 3 ORDER BY day, rn;	(tips.assign(rn=tips.sort_values([‘total_bill’], ascending=False) .groupby([‘day’]) .cumcount() + 1) .query(‘rn < 3’) .sort_values([‘day’, ‘rn’]))	the same using rank(method=’first’) function
		(tips.assign(rnk=tips.groupby([‘day’])[‘total_bill’].rank(method=’first’, ascending=False)).query(‘rnk < 3’).sort_values([‘day’, ‘rnk’]))
UPDATE	UPDATE tips SET tip = tip2 WHERE* tip < 2;	tips.loc[tips[‘tip’] < 2, ‘tip’] *= 2
DELETE	DELETE FROM tips WHERE tip > 9;	tips = tips.loc[tips[‘tip’] <= 9]