Pandas V.S SQL

Categories

If you knew SQL before and want to migrate to Python, you can use this article.

TiTle SQL Pandas Desc
Simple SELECT total_bill, tip, smoker, time
FROM tips
LIMIT ۵;
tips[[‘total_bill’, ‘tip’, ‘smoker’, ‘time’]].head(۵)  
Where SELECT *
FROM tips
WHERE time = ‘Dinner’
LIMIT ۵;
tips[tips[‘time’] == ‘Dinner’].head(۵)  
Multiple conditions SELECT *
FROM tips
WHERE time = ‘Dinner’ AND tip > ۵.۰۰;
tips[(tips[‘time’] == ‘Dinner’) & (tips[‘tip’] > ۵.۰۰)]  
  SELECT *
FROM frame
WHERE col2 IS NULL;
frame[frame[‘col2’].isna()]  
Group By SELECT sex, count(*)
FROM tips
GROUP BY sex;
tips.groupby(‘sex’).size() Notice that in the pandas code we used size() and not count(). This is because count() applies the function to each column, returning the number of not null records within each.
  SELECT day, AVG(tip), COUNT(*)
FROM tips
GROUP BY day;
tips.groupby(‘day’).agg({‘tip’: np.mean, ‘day’: np.size})  
  SELECT smoker, day, COUNT(*), AVG(tip)
FROM tips
GROUP BY smoker, day;
tips.groupby([‘smoker’, ‘day’]).agg({‘tip’: [np.size, np.mean]})  
Unique list SELECT DISTINCT finalized from tbl Df.finalized.unique()  
INNER JOIN SELECT *
FROM df1
INNER JOIN df2
  ON df1.key = df2.key;
pd.merge(df1, df2, on=’key’) also offers parameters for cases when you’d like to join one DataFrame’s column with another DataFrame’s index.
    pd.merge(df1, indexed_df2, left_on=’key’, right_index=True)  
LEFT OUTER JOIN  //show all records from df1
SELECT *
FROM df1
LEFT OUTER JOIN df2
  ON df1.key = df2.key;
pd.merge(df1, df2, on=’key’, how=’left’)  
RIGHT JOIN // show all records from df2
SELECT *
FROM df1
RIGHT OUTER JOIN df2
  ON df1.key = df2.key;
pd.merge(df1, df2, on=’key’, how=’right’)  
FULL JOIN //show all records from both tables
SELECT *
FROM df1
FULL OUTER JOIN df2
  ON df1.key = df2.key;
pd.merge(df1, df2, on=’key’, how=’outer’)  
UNION SELECT city, rank
FROM df1
UNION ALL
SELECT city, rank
FROM df2;
df1 = pd.DataFrame({‘city’: [‘Chicago’, ‘San Francisco’, ‘New York City’],  ‘rank’: range(1, 4)}) /// df2 = pd.DataFrame({‘city’: [‘Chicago’, ‘Boston’, ‘Los Angeles’],’rank’: [1, 4, 5]}) ///pd.concat([df1, df2])  
    pd.concat([df1, df2]).drop_duplicates()  
Top N rows with offset SELECT * FROM tips
ORDER BY tip DESC
LIMIT 10 OFFSET 5;
tips.nlargest(10 + 5, columns=’tip’).tail(10) The OFFSET  clause skips the offset rows before beginning to return the rows
Top N rows per group SELECT * FROM (
  SELECT
    t.*,
    ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
  FROM tips t
)
WHERE rn < 3
ORDER BY day, rn;
(tips.assign(rn=tips.sort_values([‘total_bill’], ascending=False) .groupby([‘day’]) .cumcount() + 1) .query(‘rn < 3’) .sort_values([‘day’, ‘rn’])) the same using rank(method=’first’) function
    (tips.assign(rnk=tips.groupby([‘day’])[‘total_bill’].rank(method=’first’, ascending=False)).query(‘rnk < 3’).sort_values([‘day’, ‘rnk’]))

 

 
UPDATE

 

UPDATE tips

SET tip = tip*2

WHERE tip < 2;

 

tips.loc[tips[‘tip’] < 2, ‘tip’] *= 2

 

 
DELETE

 

DELETE FROM tips

WHERE tip > 9;

 

tips = tips.loc[tips[‘tip’] <= 9]