Let's use this excel data and adopt this article to see panda's pivot table in action. We will mostly be using the module pandas DataFrame.xs.
Now everything is compiled in a single code below, uncomment (remove # before "print(pt)) to check the output in every stage from top - bottom. You do not need to set up your local environment to run the code, run it here.
- #Import the required libraries
- import pandas as pd
- import numpy as np
- import urllib.request
- #print("This is Pandas version:", pd.__version__)
- #Download and read the data into DataFrame
- urllib.request.urlretrieve("https://pbpython.com/extras/sales-funnel.xlsx", "sales-funnel.xlsx")
- df = pd.read_excel("./sales-funnel.xlsx")
- #print(df)
- #print(df.head())
- #Let’s define the status column as a category and set the order we want to view
- df["Status"] = df["Status"].astype("category")
- df["Status"].cat.set_categories(["won","pending","presented","declined"],inplace=True)
- #Pivoting the data
- #The simplest pivot table must have a dataframe and an index. Let’s use the Name as our index
- pt = pd.pivot_table(df,index=["Name"])
- #print(pt)
- #Let's put multiple indices via list
- pt = pd.pivot_table(df,index=["Name","Rep","Manager"])
- #print(pt)
- #Let's change index and look by Manager and Rep
- #You can see that the pivot table is smart enough, it aggregates the data and summarizes it by grouping the reps with their managers
- #This is the power of pandas pito table
- pt = pd.pivot_table(df,index=["Manager","Rep"])
- #print(pt)
- #For this purpose, we do not need Account and Quantity columns,
- #Let’s omit them by explicitly defining the columns we need using the values field
- pt = pd.pivot_table(df,index=["Manager","Rep"],values=["Price"])
- #print(pt)
- #The price column automatically averages the data, we can however count or sum.
- #Addition is achieved using aggfunc and np.sum
- pt = pd.pivot_table(df,index=["Manager","Rep"],values=["Price"],aggfunc=np.sum)
- #print(pt)
- #aggfunc can take a list of functions such as numpy.mean function and len to get a count
- pt = pd.pivot_table(df,index=["Manager","Rep"],values=["Price"],aggfunc=[np.mean,len])
- #print(pt)
- #Columns vs Values
- """If we want to break down sales by the products, the columns variable allows us to define one or more columns.
- columns optionally provide an additional way to segment the actual values of your interest.
- The aggregation functions are applied to the list of values"""
- pt = pd.pivot_table(df,index=["Manager","Rep"],values=["Price"],
- columns=["Product"],aggfunc=[np.sum])
- #print(pt)
- #We can remove the NAN's as seen after printing the pivot table above by setting fill_value to 0.
- pt = pd.pivot_table(df,index=["Manager","Rep"],values=["Price"],
- columns=["Product"],aggfunc=[np.sum],fill_value=0)
- #print(pt)
- #Let's add quantity to the values list
- pt = pd.pivot_table(df,index=["Manager","Rep"],values=["Price","Quantity"],
- columns=["Product"],aggfunc=[np.sum],fill_value=0)
- #print(pt)
- #You can move items to the index to get a different visual representation.
- #Let's remove Product from the columns and add to the index
- pt = pd.pivot_table(df,index=["Manager","Rep","Product"],
- values=["Price","Quantity"],aggfunc=[np.sum],fill_value=0)
- #print(pt)
- #Let's display some totals, to achieve this we use margins=True
- pt = pd.pivot_table(df,index=["Manager","Rep","Product"],
- values=["Price","Quantity"],
- aggfunc=[np.sum,np.mean],fill_value=0,margins=True)
- #print(pt)
- #Let’s move the analysis up a level and look at the data pipeline at the manager level
- #Notice how the status is ordered based on earlier category definition
- pt = pd.pivot_table(df,index=["Manager","Status"],values=["Price"],
- aggfunc=[np.sum],fill_value=0,margins=True)
- #print(pt)
- #It is very handy to pass a dictionary to the aggfunc so that we can perform different functions on each of the values we select
- pt = pd.pivot_table(df,index=["Manager","Status"],columns=["Product"],values=["Quantity","Price"],
- aggfunc={"Quantity":len,"Price":np.sum},fill_value=0)
- #print(pt)
- #We can provide a list of aggfuncs to apply to each value too
- tb = pd.pivot_table(df,index=["Manager","Status"],columns=["Product"],values=["Quantity","Price"],
- aggfunc={"Quantity":len,"Price":[np.sum,np.mean]},fill_value=0)
- #print(tb)
- tb
- #Advanced Pivot Table Filtering
- #You can filter DataFrame generated data using your standard DataFrame functions
- #Suppoes we want to look at just one manager:
- pt = tb.query('Manager == ["Debra Henley"]')
- #print(tb)
- #We can look at all of our pending and won deals
- pt = tb.query('Status == ["pending","won"]')
- print(pt)