Combine Data Collected by Paper Questionnaire, Google Survey Forms and ODK Collect and Automate Processes and Analyses

Including Cython, Jython, IronPython, PyPy, Django framework, and interpreters: Ipython, IPython Jupyter/Notebook, CPython


Post Reply
User avatar
Eli
Senior Expert Member
Reactions: 183
Posts: 5334
Joined: 9 years ago
Location: Tanzania
Has thanked: 75 times
Been thanked: 88 times
Contact:

#1

The Python snippet below intelligently combines datasets collected by various forms such as hand-engineered data or data collected by paper questionnaire, data collected by Google survey forms and those collected by ODK Collect. The aim is to speed up data collection and then automate processes to visualize and analyze the results.

Combine data

  1. import gspread
  2. import urllib.request
  3. urllib.request.urlretrieve("https://www.dropbox.com/s/m728v370159b2xm/credentials.json?dl=1", "credentials.json")
  4.  
  5. gc = gspread.service_account(filename="credentials.json")
  6. #We can open spreadsheets by Name, or by url or by key (.open(), open_by_url(), open_by_key())
  7. #Let's open our spreadsheets by their keys
  8. wb1 = gc.open_by_key("1BC48PKPZW71AC6hOn1SNesvZk1PoBjE4wXwl1YlWFpY")
  9. wb2 = gc.open_by_key("1UnkRYcOhLFMgyT_CzByvupvdaD5cL5b_nCcOeoy1uy8")
  10. wb3 = gc.open_by_key("1E2WYRF9nAxtmUW57Ng05ZsH_0R5UZ0BLLaLcAy9tJzM")
  11.  
  12. #Let's rename some the headers in workbook 1
  13. ws1 = wb1.sheet1
  14. import numpy as np
  15. array = np.array([["Name", "Age", "Height", "Date"]])
  16. # Write the array to worksheet starting from the cell B1
  17. ws1.update('B1', array.tolist())
  18.  
  19. #Let's create a new sheet named Master_sheet under the spreadsheet ODK Test Spreadsheet
  20. #newsheet = wb1.add_worksheet(title="Master_Sheet", rows="1024", cols="20")
  21.  
  22. #We can create under the ODK Test Spreadsheet Google sheet copies of the Google form and manually collected data
  23. #newsheet = wb1.add_worksheet(title="From_Gsurvey_form", rows="1024", cols="20")
  24. #newsheet = wb1.add_worksheet(title="Collected_Manually", rows="1024", cols="20")
  25.  
  26. #Let's copy data from the two sheets in different spreadsheets to the two newly created sheets under ODK Test Spreadsheet
  27. ws2 = wb2.sheet1
  28. ws3 = wb3.sheet1
  29.  
  30. sheet1 = ws1.get_all_values()
  31. sheet2 = ws2.get_all_values()
  32. sheet3 = ws3.get_all_values()
  33.  
  34. #Dataframes
  35. import pandas as pd
  36. df1 = pd.DataFrame.from_records(sheet1[1:], columns=sheet1[0])
  37. df2 = pd.DataFrame.from_records(sheet2[1:], columns=sheet2[0])
  38. df3 = pd.DataFrame.from_records(sheet3[1:], columns=sheet3[0])
  39.  
  40. all_sheets = pd.concat([df1, df2, df3], axis=1)
  41.  
  42. #Drop all NANs
  43. clean_df = all_sheets.fillna("")
  44. #Let's create a list of lists for the dataframes including columns
  45. combined_Data = [clean_df.columns.tolist()] + clean_df.to_numpy().tolist()
  46.  
  47. mastersheet = wb1.worksheet("Master_Sheet")
  48.  
  49. #Let's update the Master Sheet, start pasting from cell A1
  50. mastersheet.update("A1", combined_Data, value_input_option="USER_ENTERED")
  51.  
  52. import os
  53. os.remove("credentials.json")


Data extracted from paper questionnaire



Data collected via Google Survey form



Data Collected using ODK Collect



The combined dataset is contained in the Master_Sheet of the spreadsheet above.


Finally, we can run the code below to generate plots and various statistics. In practice, as the dataset grow, we need to only select and add the appropriate presentation methods and discard the less useful ones.

Visualize data

  1. #Plot some graph
  2. #Import required libraries
  3. import gspread
  4. import urllib.request
  5. import numpy as np
  6. import matplotlib.pyplot as plt
  7. import seaborn as sns
  8. import pandas as pd
  9.  
  10. #Let's Read Data from Google Sheets into Pandas without the Google Sheets API
  11. #Useful for multiple sheets
  12. sheet_id = "1BC48PKPZW71AC6hOn1SNesvZk1PoBjE4wXwl1YlWFpY"
  13. sheet_name = "Master_Sheet"
  14. url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
  15.  
  16. #Get Pandas dataframe
  17. dataset = pd.read_csv(url)
  18.  
  19. #worksheet = sh.sheet1
  20. #age = worksheet.col_values(3)[1:]
  21. age = dataset["Age"]
  22. print("Ages:", age)
  23. #Height = worksheet.col_values(4)[1:]
  24. Height = dataset["Height"]
  25. print("Heights:", Height)
  26.  
  27. #Pandas is extremely very useful for Google spreadsheets
  28. #Convert the json to Pandas dataframe
  29. #Get all data records as dictionary
  30. #data = worksheet.get_all_records()
  31. #df = pd.DataFrame.from_dict(data)
  32.  
  33. #Let's get some statistics
  34. #age_arr = np.array(age)
  35. #age_array = age_arr.astype(float)
  36. #h_arr = np.array(Height)
  37. #h_array = h_arr.astype(float)
  38.  
  39. print("Average Age:", np.mean(age))
  40. print("Mean Height:", np.mean(Height))
  41. print("Minimum and Maximum Age:", np.min(age), np.max(age))
  42. print("Minimum and Maximum Height:", np.min(Height), np.max(Height))
  43.  
  44. #Let's visualize
  45. #Graph styles and font size
  46. sns.set_style('darkgrid') # darkgrid, white grid, dark, white and ticks
  47. plt.rc('axes', titlesize=18)     # fontsize of the axes title
  48. plt.rc('axes', labelsize=14)    # fontsize of the x and y labels
  49. plt.rc('xtick', labelsize=13)    # fontsize of the tick labels
  50. plt.rc('ytick', labelsize=13)    # fontsize of the tick labels
  51. plt.rc('legend', fontsize=13)    # legend fontsize
  52. plt.rc('font', size=13)          # controls default text sizes
  53.  
  54. #sns list of color plettes
  55. #print(sns.color_palette('deep'), sns.color_palette("pastel"), sns.color_palette("Set2"))
  56. #If your file only has one sheet, replace sheet_url
  57. #sheet_url = “https://docs.google.com/spreadsheets/d/1XqOtPkiE_Q0dfGSoyxrH730RkwrTczcRbDeJJpqRByQ/edit#gid=0"
  58. #url_1 = sheet_url.replace(‘/edit#gid=’, ‘/export?format=csv&gid=’)
  59.  
  60. #print(dataset)
  61.  
  62. #Names = worksheet.col_values(2)[1:]
  63. #Names = data["Name"]
  64. #print(Names)
  65.  
  66. df_names = dataset["Name"]
  67. df_ages = dataset["Age"]
  68. df_Heights = dataset["Height"]
  69. print(df_names)
  70.  
  71. #Preprocessing
  72. plots = dataset.groupby(['Name'], as_index=False).mean()
  73. #print(plots)
  74.  
  75. #Bar Plot in MatplotLib with plt.bar()
  76. #Names vs Age
  77. plt.figure(figsize=(10,5), tight_layout=True)
  78. colors = sns.color_palette('pastel')
  79. plt.bar(dataset['Name'], dataset['Age'], color=colors[:5])
  80. plt.xlabel('Name')
  81. plt.xticks(rotation=90)
  82. plt.ylabel('Age')
  83. plt.title('Barplot')
  84. plt.show()
  85.  
  86. #Name Vs Height
  87. plt.figure()
  88. plt.figure(figsize=(10,5), tight_layout=True)
  89. colors = sns.color_palette('deep')
  90. plt.bar(dataset['Name'], dataset['Height'], color=colors[:6])
  91. plt.xlabel('Name')
  92. plt.xticks(rotation=90)
  93. plt.ylabel('Height')
  94. plt.title('Barplot')
  95. plt.show()
  96.  
  97. #Bar Plot in Seaborn with sns.barplot()
  98. plt.figure(figsize=(10,5), tight_layout=True)
  99. ax = sns.barplot(x=dataset['Name'], y=dataset['Age'], palette='pastel', ci=None)
  100. ax.set(title='Barplot with Seaborn', xlabel='Names', ylabel='Age')
  101. plt.xticks(rotation=90)
  102. plt.show()
  103.  
  104. #Barplot grouped data by "n" variables
  105. plt.figure(figsize=(12, 6), tight_layout=True)
  106. ax = sns.barplot(x=dataset['Age'], y=dataset['Height'], hue=dataset['Name'], palette='pastel')
  107. ax.set(title='Age vs Height' ,xlabel='Age', ylabel='Height')
  108. ax.legend(title='Names', title_fontsize='13', loc='upper right')
  109. plt.show()
  110.  
  111. #Histograms with plt.hist() or sns.histplot()
  112. plt.figure(figsize=(10,6), tight_layout=True)
  113. bins = [40, 50, 60, 70, 80]
  114. # matplotlib
  115. plt.hist(dataset['Height'], bins=bins, color=sns.color_palette('Set2')[2], linewidth=2)
  116. plt.title('Histogram')
  117. plt.xlabel('Height (cm)')
  118. plt.ylabel('Count')
  119. # seaborn
  120. ax = sns.histplot(data=dataset, x='Height', bins=bins, color=sns.color_palette('Set2')[2], linewidth=2)
  121. ax.set(title='Histogram', xlabel='Height (cm)', ylabel='Count')
  122. plt.show()
  123.  
  124. #Boxplot
  125. plt.figure(figsize=(10,6), tight_layout=True)
  126. ax = sns.boxplot(data=dataset, x='Name', y='Age', palette='Set2', linewidth=2.5)
  127. ax.set(title='Boxplot', xlabel='Names', ylabel='Age (Years)')
  128. plt.xticks(rotation=90)
  129. plt.show()
  130.  
  131. #Scatter plot
  132. plt.figure(figsize=(10,6), tight_layout=True)
  133. ax = sns.scatterplot(data=dataset, x='Age', y='Height',   hue='Name', palette='Set2', s=60)
  134. ax.set(xlabel='Age (Years)', ylabel='Height (cms)')
  135. ax.legend(title='People', title_fontsize = 12)
  136. plt.show()
  137.  
  138. #Something else
  139. pivot = dataset.groupby(['Name'], as_index=False).mean()
  140. relationship = pivot.loc[:,"Age":"Height"]
  141. print(relationship)
  142.  
  143. #Plot some graph
  144. charts = ["bar", "line", "barh", "hist", "box", "kde", "density", "area"]
  145. for chart_type in charts:
  146.     relationship.plot(kind="%s" % chart_type) #Replace bar with line, barh, hist, box, kde, density, area
  147.     plt.title("%s plot" % chart_type)
  148.     plt.show()
  149.  
  150. #Seaborn
  151. plt.figure()
  152. sns.set_style("darkgrid")
  153. sns.lineplot(data = dataset, x = "Age", y = "Height")
  154. plt.show()
  155.  
  156. plt.figure()
  157. sns.set_style("darkgrid")
  158. sns.lineplot(data = dataset, x = "Height", y = "Age")
  159. plt.show()
  160.  
  161. #replot
  162. plt.figure()
  163. sns.set_theme(style="darkgrid")
  164. sns.relplot(x="Age", y="Height", hue="Name", data=dataset);
  165. plt.show()
  166.  
  167. #Hexbin
  168. #Split the plotting window into 20 hexbins
  169. plt.figure()
  170. nbins = 15
  171. plt.title('Hexbin')
  172. plt.hexbin(dataset["Age"], dataset["Height"], gridsize=nbins, color=colors[:5])
  173. plt.show()
  174.  
  175. #Hexbin 2
  176. #Split the plotting window into 20 hexbins
  177. plt.figure()
  178. nbins = 15
  179. plt.title('Hexbin')
  180. plt.hexbin(dataset["Age"], dataset["Height"], gridsize=nbins, cmap=plt.cm.BuGn_r)
  181. plt.show()
  182.  
  183. #2-D Hist
  184. plt.figure()
  185. plt.title('2-D Histogram')
  186. plt.hist2d(dataset["Age"], dataset["Height"], bins=nbins, color=colors[:6])
  187. plt.show()
  188.  
  189. #2-D Hist 2
  190. plt.figure()
  191. plt.title('2-D Histogram')
  192. plt.hist2d(dataset["Age"], dataset["Height"], bins=nbins, cmap=plt.cm.BuGn_r)
  193. plt.show()
  194.  
  195. #Set variables
  196. x = dataset["Age"]
  197. y = dataset["Height"]
  198. z = dataset["Name"]
  199.  
  200. #Linear Regression
  201. plt.figure()
  202. sns.regplot(x = x, y = y, data=dataset);
  203. plt.show()
  204.  
  205. plt.figure()
  206. sns.jointplot(x=x, y=y, data=dataset, kind="reg");
  207. plt.show()
  208.  
  209. #Set seaborn style
  210. sns.set_style("white")
  211.  
  212. # Basic 2D density plot
  213. plt.figure()
  214. sns.kdeplot(x=x, y=y)
  215. plt.show()
  216.  
  217. # Custom the color, add shade and bandwidth
  218. plt.figure()
  219. sns.kdeplot(x=x, y=y, cmap="Reds", shade=True, bw_adjust=.5)
  220. plt.show()
  221.  
  222. # Add thresh parameter
  223. plt.figure()
  224. sns.kdeplot(x=x, y=y, cmap="Blues", shade=True, thresh=0)
  225. plt.show()
  226.  
  227. #Joint plot
  228. plt.figure()
  229. sns.jointplot(x = x,y = y,data = dataset,kind = 'hex')
  230. plt.show()

0
TSSFL -- A Creative Journey Towards Infinite Possibilities!
Post Reply

Return to “Python Programming”

  • Information
  • Who is online

    Users browsing this forum: No registered users and 18 guests