"""
People Also Ask (PAA) Scraper - Streamlit App
Recursively extracts PAA questions from search results using ValueSERP API.
Author: Lee Foot
Website: https://www.leefoot.com
"""
import streamlit as st
import pandas as pd
import requests
import time
from datetime import datetime
import io
st.set_page_config(
    page_title="People Also Ask Scraper",
    page_icon="❓",
    layout="wide"
)
st.title("❓ People Also Ask (PAA) Scraper")
st.markdown("*Created by* [](https://www.leefoot.com) · [](https://www.leefoot.com/contact) · [](https://www.linkedin.com/in/lee-foot/) · [](https://bsky.app/profile/leefootseo.bsky.social) · [](https://leefoot.com/tools) · [](https://github.com/searchsolved/search-solved-public-seo)")
st.markdown("Extract 'People Also Ask' questions recursively using the ValueSERP API.")
with st.expander("How to use this tool"):
    st.markdown("""
**What this tool does:**
- Recursively extracts PAA questions from Google search results
- Expands each PAA to find deeper question layers
- Captures answer snippets and source URLs

**How to use:**
1. Get a ValueSERP API key from [valueserp.com](https://www.valueserp.com/)
2. Enter your API key in the sidebar
3. Configure search settings (location, language, device)
4. Enter seed keywords (one per line)
5. Click "Extract PAA Questions"

**Output columns:**
- **original_query**: Your seed keyword
- **level**: Depth (1 = direct PAA, 2 = PAA of PAA, etc.)
- **question**: The PAA question text
- **answer_snippet**: The snippet answer shown in search
- **source_url/title**: The answer source

**Best for:**
- Building comprehensive FAQ pages
- Discovering content topic ideas
- Finding featured snippet opportunities
- Understanding user search intent
""")
# Sidebar configuration
with st.sidebar:
    st.header("API Configuration")
    api_key = st.text_input(
        "ValueSERP API Key",
        type="password",
        help="Get your API key from https://www.valueserp.com/"
    )

    st.markdown("---")
    st.header("Search Settings")

    # Location settings
    location = st.selectbox(
        "Location",
        options=[
            "United States", "United Kingdom", "Canada", "Australia",
            "Germany", "France", "Spain", "Italy", "Netherlands",
            "Brazil", "Mexico", "India", "Japan"
        ],
        index=0,
        help="Location for search results"
    )
    country_codes = {
        "United States": ("us", "google.com"),
        "United Kingdom": ("uk", "google.co.uk"),
        "Canada": ("ca", "google.ca"),
        "Australia": ("au", "google.com.au"),
        "Germany": ("de", "google.de"),
        "France": ("fr", "google.fr"),
        "Spain": ("es", "google.es"),
        "Italy": ("it", "google.it"),
        "Netherlands": ("nl", "google.nl"),
        "Brazil": ("br", "google.com.br"),
        "Mexico": ("mx", "google.com.mx"),
        "India": ("in", "google.co.in"),
        "Japan": ("jp", "google.co.jp")
    }
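    # NOTE: each tuple above pairs ValueSERP's `gl` country code with the
    # matching `google_domain` parameter; the two should agree, otherwise
    # results may come from the wrong regional index.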
    language = st.selectbox(
        "Language",
        options=["en", "de", "fr", "es", "it", "nl", "pt", "ja"],
        index=0,
        help="Language for search results"
    )

    device = st.selectbox(
        "Device",
        options=["Desktop", "Mobile", "Tablet"],
        index=0
    )

    st.markdown("---")
    st.header("Scrape Settings")

    max_depth = st.slider(
        "Max Depth",
        min_value=1,
        max_value=5,
        value=2,
        help="How many levels deep to follow PAA questions"
    )

    request_delay = st.slider(
        "Request Delay (seconds)",
        min_value=0.0,
        max_value=5.0,
        value=0.5,
        step=0.5,
        help="Delay between API requests"
    )
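# Rough cost note (assuming Google typically shows ~4 PAA questions per result):
# each extra depth level multiplies the number of API calls by the PAA count, so
# a single seed keyword at max_depth=3 can already mean ~1 + 4 + 16 = 21 requests.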
def get_related_questions(query, api_key, location, country_code, google_domain,
                          language, device, max_depth, delay,
                          level=1, all_questions=None, parent=None,
                          original_query=None, progress_callback=None):
    """Recursively fetch related questions from ValueSERP API."""
    if all_questions is None:
        all_questions = []
        original_query = query

    if level > max_depth:
        return all_questions

    if progress_callback:
        progress_callback(f"Level {level}: Querying '{query[:50]}...'")
    params = {
        'api_key': api_key,
        'q': query,
        'gl': country_code,
        'hl': language,
        'location': location,
        'google_domain': google_domain,
        'device': device.lower(),
        'output': 'json',
        'page': '1',
        'num': '10',
        'include_fields': 'related_questions'
    }
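    # `include_fields` asks ValueSERP to return only the related_questions block,
    # which keeps responses small; drop it if you also want organic results.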
    try:
        response = requests.get('https://api.valueserp.com/search', params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        questions = data.get('related_questions', [])
        if not questions:
            return all_questions

        for q in questions:
            question_text = q.get('question', '')
            if not question_text:
                continue

            answer = q.get('answer') if isinstance(q.get('answer'), dict) else {}
            question_data = {
                'original_query': original_query,
                'level': level,
                'parent_query': parent if parent else query,
                'question': question_text,
                'answer_snippet': answer.get('text', ''),
                'source_url': answer.get('link', ''),
                'source_title': answer.get('title', '')
            }

            # Check for duplicates
            if not any(d.get('question') == question_text for d in all_questions):
                all_questions.append(question_data)

            # Recursively query next level
            if level < max_depth:
                time.sleep(delay)
                get_related_questions(
                    question_text,
                    api_key, location, country_code, google_domain,
                    language, device, max_depth, delay,
                    level=level + 1,
                    all_questions=all_questions,
                    parent=question_text,
                    original_query=original_query,
                    progress_callback=progress_callback
                )
    except requests.exceptions.RequestException as e:
        if progress_callback:
            progress_callback(f"Error: {str(e)}")

    return all_questions
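# NOTE: get_related_questions mutates the shared `all_questions` list across
# recursive calls instead of merging return values; duplicates are filtered
# globally by question text, although duplicate questions are still expanded
# at deeper levels (each expansion costs one extra API call).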
# Main app
st.subheader("Enter Keywords")

keywords_input = st.text_area(
    "Keywords (one per line)",
    height=150,
    placeholder="Enter your seed keywords, one per line:\n\nwhat is SEO\nhow to rank on Google\nbest keyword research tools"
)
col1, col2 = st.columns([1, 3])
with col1:
    run_button = st.button("Extract PAA Questions", type="primary", disabled=not api_key)

if not api_key:
    st.warning("Please enter your ValueSERP API key in the sidebar.")
if run_button and keywords_input and api_key:
    keywords = [k.strip() for k in keywords_input.strip().split('\n') if k.strip()]

    if not keywords:
        st.error("Please enter at least one keyword.")
    else:
        country_code, google_domain = country_codes[location]
        all_results = []

        progress_bar = st.progress(0)
        status_text = st.empty()

        def update_progress(message):
            status_text.text(message)

        for i, keyword in enumerate(keywords):
            update_progress(f"Processing keyword {i+1}/{len(keywords)}: {keyword}")
            results = get_related_questions(
                keyword,
                api_key,
                location,
                country_code,
                google_domain,
                language,
                device,
                max_depth,
                request_delay,
                progress_callback=update_progress
            )
            all_results.extend(results)
            progress_bar.progress((i + 1) / len(keywords))

        progress_bar.progress(1.0)
        if all_results:
            df = pd.DataFrame(all_results)

            # Reorder columns
            columns = ['original_query', 'level', 'parent_query', 'question',
                       'answer_snippet', 'source_url', 'source_title']
            columns = [c for c in columns if c in df.columns]
            df = df[columns]

            st.success(f"Found {len(df):,} unique PAA questions!")

            # Summary metrics
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Total Questions", f"{len(df):,}")
            with col2:
                st.metric("Seed Keywords", len(keywords))
            with col3:
                st.metric("Avg Questions/Keyword", f"{len(df)/len(keywords):.1f}")
            with col4:
                st.metric("Max Depth Used", int(df['level'].max()))
            # Questions by level
            st.subheader("Questions by Level")
            level_df = df['level'].value_counts().sort_index().reset_index()
            level_df.columns = ['Level', 'Count']
            st.bar_chart(level_df.set_index('Level'))

            # Show data
            st.subheader("All PAA Questions")
            st.dataframe(df, use_container_width=True, height=400)
            # Download options
            st.subheader("Download Results")
            col1, col2 = st.columns(2)

            with col1:
                csv = df.to_csv(index=False)
                st.download_button(
                    "Download CSV",
                    csv,
                    file_name=f"paa_questions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                    mime="text/csv"
                )

            with col2:
                output = io.BytesIO()
                with pd.ExcelWriter(output, engine='openpyxl') as writer:
                    df.to_excel(writer, index=False, sheet_name='PAA Questions')
                excel_data = output.getvalue()
                st.download_button(
                    "Download Excel",
                    excel_data,
                    file_name=f"paa_questions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
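                # NOTE: the Excel export assumes the optional `openpyxl` dependency
                # is installed (pandas raises ImportError from ExcelWriter otherwise).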
        else:
            st.warning("No PAA questions found for the entered keywords.")

# Footer
st.markdown("---")