Combine Data Collected by Paper Questionnaire, Google Survey Forms and ODK Collect and Automate Processes and Analyses
Posted: Mon May 17, 2021 7:01 pm
The Python snippet below intelligently combines datasets collected by various forms such as hand-engineered data or data collected by paper questionnaire, data collected by Google survey forms and those collected by ODK Collect. The aim is to speed up data collection and then automate processes to visualize and analyze the results.
Combine data
Data extracted from paper questionnaire
Data collected via Google Survey form
Data Collected using ODK Collect
The combined dataset is contained in the Master_Sheet of the spreadsheet above.
Finally, we can run the code below to generate plots and various statistics. In practice, as the dataset grow, we need to only select and add the appropriate presentation methods and discard the less useful ones.
Visualize data
Combine data
- import gspread
- import urllib.request
- urllib.request.urlretrieve("https://www.dropbox.com/s/m728v370159b2xm/credentials.json?dl=1", "credentials.json")
- gc = gspread.service_account(filename="credentials.json")
- #We can open spreadsheets by Name, or by url or by key (.open(), open_by_url(), open_by_key())
- #Let's open our spreadsheets by their keys
- wb1 = gc.open_by_key("1BC48PKPZW71AC6hOn1SNesvZk1PoBjE4wXwl1YlWFpY")
- wb2 = gc.open_by_key("1UnkRYcOhLFMgyT_CzByvupvdaD5cL5b_nCcOeoy1uy8")
- wb3 = gc.open_by_key("1E2WYRF9nAxtmUW57Ng05ZsH_0R5UZ0BLLaLcAy9tJzM")
- #Let's rename some the headers in workbook 1
- ws1 = wb1.sheet1
- import numpy as np
- array = np.array([["Name", "Age", "Height", "Date"]])
- # Write the array to worksheet starting from the cell B1
- ws1.update('B1', array.tolist())
- #Let's create a new sheet named Master_sheet under the spreadsheet ODK Test Spreadsheet
- #newsheet = wb1.add_worksheet(title="Master_Sheet", rows="1024", cols="20")
- #We can create under the ODK Test Spreadsheet Google sheet copies of the Google form and manually collected data
- #newsheet = wb1.add_worksheet(title="From_Gsurvey_form", rows="1024", cols="20")
- #newsheet = wb1.add_worksheet(title="Collected_Manually", rows="1024", cols="20")
- #Let's copy data from the two sheets in different spreadsheets to the two newly created sheets under ODK Test Spreadsheet
- ws2 = wb2.sheet1
- ws3 = wb3.sheet1
- sheet1 = ws1.get_all_values()
- sheet2 = ws2.get_all_values()
- sheet3 = ws3.get_all_values()
- #Dataframes
- import pandas as pd
- df1 = pd.DataFrame.from_records(sheet1[1:], columns=sheet1[0])
- df2 = pd.DataFrame.from_records(sheet2[1:], columns=sheet2[0])
- df3 = pd.DataFrame.from_records(sheet3[1:], columns=sheet3[0])
- all_sheets = pd.concat([df1, df2, df3], axis=1)
- #Drop all NANs
- clean_df = all_sheets.fillna("")
- #Let's create a list of lists for the dataframes including columns
- combined_Data = [clean_df.columns.tolist()] + clean_df.to_numpy().tolist()
- mastersheet = wb1.worksheet("Master_Sheet")
- #Let's update the Master Sheet, start pasting from cell A1
- mastersheet.update("A1", combined_Data, value_input_option="USER_ENTERED")
- import os
- os.remove("credentials.json")
Data extracted from paper questionnaire
Data collected via Google Survey form
Data Collected using ODK Collect
The combined dataset is contained in the Master_Sheet of the spreadsheet above.
Finally, we can run the code below to generate plots and various statistics. In practice, as the dataset grow, we need to only select and add the appropriate presentation methods and discard the less useful ones.
Visualize data
- #Plot some graph
- #Import required libraries
- import gspread
- import urllib.request
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- import pandas as pd
- #Let's Read Data from Google Sheets into Pandas without the Google Sheets API
- #Useful for multiple sheets
- sheet_id = "1BC48PKPZW71AC6hOn1SNesvZk1PoBjE4wXwl1YlWFpY"
- sheet_name = "Master_Sheet"
- url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
- #Get Pandas dataframe
- dataset = pd.read_csv(url)
- #worksheet = sh.sheet1
- #age = worksheet.col_values(3)[1:]
- age = dataset["Age"]
- print("Ages:", age)
- #Height = worksheet.col_values(4)[1:]
- Height = dataset["Height"]
- print("Heights:", Height)
- #Pandas is extremely very useful for Google spreadsheets
- #Convert the json to Pandas dataframe
- #Get all data records as dictionary
- #data = worksheet.get_all_records()
- #df = pd.DataFrame.from_dict(data)
- #Let's get some statistics
- #age_arr = np.array(age)
- #age_array = age_arr.astype(float)
- #h_arr = np.array(Height)
- #h_array = h_arr.astype(float)
- print("Average Age:", np.mean(age))
- print("Mean Height:", np.mean(Height))
- print("Minimum and Maximum Age:", np.min(age), np.max(age))
- print("Minimum and Maximum Height:", np.min(Height), np.max(Height))
- #Let's visualize
- #Graph styles and font size
- sns.set_style('darkgrid') # darkgrid, white grid, dark, white and ticks
- plt.rc('axes', titlesize=18) # fontsize of the axes title
- plt.rc('axes', labelsize=14) # fontsize of the x and y labels
- plt.rc('xtick', labelsize=13) # fontsize of the tick labels
- plt.rc('ytick', labelsize=13) # fontsize of the tick labels
- plt.rc('legend', fontsize=13) # legend fontsize
- plt.rc('font', size=13) # controls default text sizes
- #sns list of color plettes
- #print(sns.color_palette('deep'), sns.color_palette("pastel"), sns.color_palette("Set2"))
- #If your file only has one sheet, replace sheet_url
- #sheet_url = “https://docs.google.com/spreadsheets/d/1XqOtPkiE_Q0dfGSoyxrH730RkwrTczcRbDeJJpqRByQ/edit#gid=0"
- #url_1 = sheet_url.replace(‘/edit#gid=’, ‘/export?format=csv&gid=’)
- #print(dataset)
- #Names = worksheet.col_values(2)[1:]
- #Names = data["Name"]
- #print(Names)
- df_names = dataset["Name"]
- df_ages = dataset["Age"]
- df_Heights = dataset["Height"]
- print(df_names)
- #Preprocessing
- plots = dataset.groupby(['Name'], as_index=False).mean()
- #print(plots)
- #Bar Plot in MatplotLib with plt.bar()
- #Names vs Age
- plt.figure(figsize=(10,5), tight_layout=True)
- colors = sns.color_palette('pastel')
- plt.bar(dataset['Name'], dataset['Age'], color=colors[:5])
- plt.xlabel('Name')
- plt.xticks(rotation=90)
- plt.ylabel('Age')
- plt.title('Barplot')
- plt.show()
- #Name Vs Height
- plt.figure()
- plt.figure(figsize=(10,5), tight_layout=True)
- colors = sns.color_palette('deep')
- plt.bar(dataset['Name'], dataset['Height'], color=colors[:6])
- plt.xlabel('Name')
- plt.xticks(rotation=90)
- plt.ylabel('Height')
- plt.title('Barplot')
- plt.show()
- #Bar Plot in Seaborn with sns.barplot()
- plt.figure(figsize=(10,5), tight_layout=True)
- ax = sns.barplot(x=dataset['Name'], y=dataset['Age'], palette='pastel', ci=None)
- ax.set(title='Barplot with Seaborn', xlabel='Names', ylabel='Age')
- plt.xticks(rotation=90)
- plt.show()
- #Barplot grouped data by "n" variables
- plt.figure(figsize=(12, 6), tight_layout=True)
- ax = sns.barplot(x=dataset['Age'], y=dataset['Height'], hue=dataset['Name'], palette='pastel')
- ax.set(title='Age vs Height' ,xlabel='Age', ylabel='Height')
- ax.legend(title='Names', title_fontsize='13', loc='upper right')
- plt.show()
- #Histograms with plt.hist() or sns.histplot()
- plt.figure(figsize=(10,6), tight_layout=True)
- bins = [40, 50, 60, 70, 80]
- # matplotlib
- plt.hist(dataset['Height'], bins=bins, color=sns.color_palette('Set2')[2], linewidth=2)
- plt.title('Histogram')
- plt.xlabel('Height (cm)')
- plt.ylabel('Count')
- # seaborn
- ax = sns.histplot(data=dataset, x='Height', bins=bins, color=sns.color_palette('Set2')[2], linewidth=2)
- ax.set(title='Histogram', xlabel='Height (cm)', ylabel='Count')
- plt.show()
- #Boxplot
- plt.figure(figsize=(10,6), tight_layout=True)
- ax = sns.boxplot(data=dataset, x='Name', y='Age', palette='Set2', linewidth=2.5)
- ax.set(title='Boxplot', xlabel='Names', ylabel='Age (Years)')
- plt.xticks(rotation=90)
- plt.show()
- #Scatter plot
- plt.figure(figsize=(10,6), tight_layout=True)
- ax = sns.scatterplot(data=dataset, x='Age', y='Height', hue='Name', palette='Set2', s=60)
- ax.set(xlabel='Age (Years)', ylabel='Height (cms)')
- ax.legend(title='People', title_fontsize = 12)
- plt.show()
- #Something else
- pivot = dataset.groupby(['Name'], as_index=False).mean()
- relationship = pivot.loc[:,"Age":"Height"]
- print(relationship)
- #Plot some graph
- charts = ["bar", "line", "barh", "hist", "box", "kde", "density", "area"]
- for chart_type in charts:
- relationship.plot(kind="%s" % chart_type) #Replace bar with line, barh, hist, box, kde, density, area
- plt.title("%s plot" % chart_type)
- plt.show()
- #Seaborn
- plt.figure()
- sns.set_style("darkgrid")
- sns.lineplot(data = dataset, x = "Age", y = "Height")
- plt.show()
- plt.figure()
- sns.set_style("darkgrid")
- sns.lineplot(data = dataset, x = "Height", y = "Age")
- plt.show()
- #replot
- plt.figure()
- sns.set_theme(style="darkgrid")
- sns.relplot(x="Age", y="Height", hue="Name", data=dataset);
- plt.show()
- #Hexbin
- #Split the plotting window into 20 hexbins
- plt.figure()
- nbins = 15
- plt.title('Hexbin')
- plt.hexbin(dataset["Age"], dataset["Height"], gridsize=nbins, color=colors[:5])
- plt.show()
- #Hexbin 2
- #Split the plotting window into 20 hexbins
- plt.figure()
- nbins = 15
- plt.title('Hexbin')
- plt.hexbin(dataset["Age"], dataset["Height"], gridsize=nbins, cmap=plt.cm.BuGn_r)
- plt.show()
- #2-D Hist
- plt.figure()
- plt.title('2-D Histogram')
- plt.hist2d(dataset["Age"], dataset["Height"], bins=nbins, color=colors[:6])
- plt.show()
- #2-D Hist 2
- plt.figure()
- plt.title('2-D Histogram')
- plt.hist2d(dataset["Age"], dataset["Height"], bins=nbins, cmap=plt.cm.BuGn_r)
- plt.show()
- #Set variables
- x = dataset["Age"]
- y = dataset["Height"]
- z = dataset["Name"]
- #Linear Regression
- plt.figure()
- sns.regplot(x = x, y = y, data=dataset);
- plt.show()
- plt.figure()
- sns.jointplot(x=x, y=y, data=dataset, kind="reg");
- plt.show()
- #Set seaborn style
- sns.set_style("white")
- # Basic 2D density plot
- plt.figure()
- sns.kdeplot(x=x, y=y)
- plt.show()
- # Custom the color, add shade and bandwidth
- plt.figure()
- sns.kdeplot(x=x, y=y, cmap="Reds", shade=True, bw_adjust=.5)
- plt.show()
- # Add thresh parameter
- plt.figure()
- sns.kdeplot(x=x, y=y, cmap="Blues", shade=True, thresh=0)
- plt.show()
- #Joint plot
- plt.figure()
- sns.jointplot(x = x,y = y,data = dataset,kind = 'hex')
- plt.show()