search-solved-public-seo/ecommerce/automatic-category-suggester/category_generator.py at main · admariner/search-solved-public-seo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
####################################################################################
#                                                                                  #
#  Automatic Category Page Suggester                                               #
#                                                                                  #
#  Analyze crawl data to suggest new category pages based on product inventory.    #
#                                                                                  #
####################################################################################
# Author   : Lee Foot                                                              #
# Website  : https://www.leefoot.com                                               #
# Contact  : https://www.leefoot.com/contact                                       #
# Email    : hello@leefoot.com                                                     #
# LinkedIn : https://www.linkedin.com/in/lee-foot/                                 #
# Bluesky  : https://bsky.app/profile/leefootseo.bsky.social                                              #
####################################################################################

import streamlit as st
import collections
import re
import string
import chardet
from polyfuzz import PolyFuzz

import pandas as pd
import requests
from nltk.util import ngrams
from stqdm import stqdm

st.set_page_config(
    page_title="Automatic Category Page Suggester by LeeFootSEO",
    page_icon="chart_with_upwards_trend",
    layout="wide",
)

# -------------------------------- Sidebar Configuration --------------------------------
st.sidebar.title("Settings")

# API Configuration
st.sidebar.header("Keywords Everywhere API")
kwe_key = st.sidebar.text_input(
    'API Key',
    type="password",
    help="Get your API key from keywordseverywhere.com"
)
country_kwe = st.sidebar.selectbox(
    'Country',
    ('us', 'uk', 'au', 'ca', 'in', 'nz', 'za'),
    index=0,
    help="Country for search volume data"
)
currency_kwe = st.sidebar.selectbox(
    'Currency',
    ('usd', 'gbp', 'eur', 'aud', 'cad', 'inr', 'nzd'),
    index=0,
    help="Currency for CPC data"
)

# Filtering Options
st.sidebar.header("Filtering")
min_search_vol = st.sidebar.number_input('Minimum Search Volume', value=100, min_value=0)
min_cpc = st.sidebar.slider("Minimum CPC ($)", value=0, min_value=0, max_value=50)
min_product_match_exact = st.sidebar.slider("Min Products (Exact Match)", value=3, min_value=1, max_value=100)
min_product_match_fuzzy = st.sidebar.slider("Min Products (Fuzzy Match)", value=3, min_value=1, max_value=100)
min_sim_match = st.sidebar.slider(
    "Max Similarity to Existing Category",
    value=96,
    min_value=0,
    max_value=100,
    help="Lower = more unique suggestions"
)

# Advanced Options
st.sidebar.header("Advanced")
keep_longest_word = st.sidebar.checkbox('Keep Longest Word Only', value=True, help="Remove shorter keyword fragments")
enable_fuzzy_product_match = st.sidebar.checkbox('Enable Fuzzy Matching (Slower)', value=False)


# -------------------------------- Main Content --------------------------------
st.title("Automatic Category Page Suggester")
st.markdown("*Created by* [![Website](https://img.shields.io/badge/-leefoot.com-2A9D8F?logoColor=white)](https://www.leefoot.com) · [![Hire Me](https://img.shields.io/badge/-Hire%20Me-FF6B6B?logoColor=white)](https://www.leefoot.com/contact) · [![LinkedIn](https://img.shields.io/badge/-LinkedIn-0A66C2?logo=linkedin&logoColor=white)](https://www.linkedin.com/in/lee-foot/) · [![Bluesky](https://img.shields.io/badge/-Bluesky-0285FF?logoColor=white)](https://bsky.app/profile/leefootseo.bsky.social) · [![More Tools](https://img.shields.io/badge/-More%20Tools-8B5CF6?logoColor=white)](https://leefoot.com/tools) · [![GitHub](https://img.shields.io/badge/-GitHub-6B7280?logoColor=white)](https://github.com/searchsolved/search-solved-public-seo)")

with st.expander("How to use this tool"):
    st.markdown("""
    **What this tool does:**
    - Suggests new category pages based on product data
    - Analyzes search demand for category opportunities
    - Identifies gaps in your category taxonomy

    **How to use:**
    1. Upload product feed CSV with titles/categories
    2. Upload search demand data (optional)
    3. Configure category generation settings
    4. Review suggested new categories

    **Best for:**
    - Ecommerce taxonomy optimization
    - Category page creation strategy
    - Search demand-driven IA improvements
    """)
st.markdown("*Discover new category page opportunities based on your product inventory and real search demand.*")
st.markdown("")

# Instructions in an expander (expanded by default)
with st.expander("How to Use This Tool", expanded=False):
    st.markdown("""
    ### Prerequisites
    You need **two exports from Screaming Frog** with custom extraction configured:

    **1. Set up Custom Extraction in Screaming Frog:**
    - Configuration > Custom > Extraction
    - Create extractions to identify **Product Pages** and **Category Pages**
    - Example: Extract a unique element that only appears on product pages

    **2. Export the required files:**

    | File | How to Export |
    |------|---------------|
    | `inlinks.csv` | Right-click any product URL > Export > Inlinks |
    | `internal_html.csv` | Bulk Export > All > Internal HTML |

    ### How It Works
    1. **N-grams** extract 2-7 word phrases from your product H1 tags
    2. Phrases are **matched to products** to ensure inventory coverage
    3. **PolyFuzz** checks similarity to existing categories (removes duplicates)
    4. **Keywords Everywhere** validates real search volume (optional but recommended)

    ### Tips
    - The Keywords Everywhere API key is optional but highly recommended
    - Without it, you'll get all n-gram combinations (many won't have search volume)
    - Get your API key at [keywordseverywhere.com](https://keywordseverywhere.com/)
    """)

st.markdown("---")

# File upload section
st.subheader("Step 1: Upload Your Screaming Frog Exports")

min_search_vol = int(min_search_vol)
data_source_kwe = 'gkp'  # gkp = google keyword planner only // cli = clickstream data + keyword planner

http_or_https_gsc = "https://"  # http prefix // default = https://
parms = "page=|p=|utm_medium|sessionid|affiliateid|sort=|order=|type=|categoryid=|itemid=|viewItems=|query" \
        "=|search=|lang="  # drop common parameter urls

# -------------------------------- File Upload Section ----------------------------------
col1, col2 = st.columns(2)

with col1:
    st.markdown("**Inlinks Export**")
    uploaded_file = st.file_uploader(
        "Upload inlinks.csv",
        type=['csv'],
        help="Right-click a product URL in Screaming Frog > Export > Inlinks"
    )

    if uploaded_file is not None:
        try:
            result = chardet.detect(uploaded_file.getvalue())
            encoding_value = result["encoding"]

            if encoding_value == "UTF-16":
                white_space = True
            else:
                white_space = False

            df_all_inlinks = pd.read_csv(
                uploaded_file,
                encoding=encoding_value,
                delim_whitespace=white_space,
                dtype=str,
                on_bad_lines='skip',
            )
            number_of_rows = len(df_all_inlinks)
            if number_of_rows == 0:
                st.error("File is empty!")
            else:
                st.success(f"Loaded {number_of_rows:,} rows")

        except UnicodeDecodeError:
            st.error("Could not read file. Check encoding.")
    else:
        st.info("Waiting for inlinks.csv...")
        st.stop()

with col2:
    st.markdown("**Internal HTML Export**")
    uploaded_crawl_file = st.file_uploader(
        "Upload internal_html.csv",
        type=['csv'],
        help="Screaming Frog > Bulk Export > All > Internal HTML"
    )

    if uploaded_crawl_file is not None:
        try:
            result = chardet.detect(uploaded_crawl_file.getvalue())
            encoding_value = result["encoding"]
            if encoding_value == "UTF-16":
                white_space = True
            else:
                white_space = False

            df_internal_html = pd.read_csv(
                uploaded_crawl_file,
                encoding=encoding_value,
                delim_whitespace=white_space,
                on_bad_lines='skip',
                dtype=str,
            )

            number_of_rows = len(df_internal_html)
            if number_of_rows == 0:
                st.error("File is empty!")
            else:
                st.success(f"Loaded {number_of_rows:,} rows")

        except UnicodeDecodeError:
            st.error("Could not read file. Check encoding.")
    else:
        st.info("Waiting for internal_html.csv...")
        st.stop()

# -------------------------------- Column Mapping Section ----------------------------------
st.markdown("---")
st.subheader("Step 2: Map Your Custom Extraction Columns")
st.markdown("Select the columns from your Screaming Frog custom extraction that identify product and category pages.")

col3, col4 = st.columns(2)

with col3:
    product_extract_col = st.selectbox(
        'Product Page Column',
        df_internal_html.columns,
        help="Column that identifies product pages (from custom extraction)"
    )

with col4:
    category_extract_col = st.selectbox(
        'Category Page Column',
        df_internal_html.columns,
        help="Column that identifies category pages (from custom extraction)"
    )

st.markdown("---")
st.subheader("Step 3: Generate Category Suggestions")
submitted_crawl_btn = st.button('Generate Suggestions', type='primary', use_container_width=True)

# ------------------------------------- start rest of code -------------------------------------------------------
if submitted_crawl_btn == True:

    # ------------------------------------- clean up the crawl files -------------------------------------------------------

    df_all_inlinks = df_all_inlinks.rename(columns={"From": "Source", "To": "Destination"})

    df_internal_html = df_internal_html[
        ~df_internal_html["Indexability"].isin(["Non-Indexable"])]  # keep indexable urls
    df_internal_html['H1-1'] = df_internal_html['H1-1'].str.lower()
    df_internal_html['H1-1'] = df_internal_html['H1-1'].str.encode('ascii', 'ignore').str.decode('ascii')
    df_internal_html['Title-1'] = df_internal_html['Title 1'].str.encode('ascii', 'ignore').str.decode('ascii')

    # ---------------------------------- work out the page type from extractors --------------------------------------------

    df1 = df_internal_html[df_internal_html[product_extract_col].notna()].copy()
    df2 = df_internal_html[df_internal_html[category_extract_col].notna()].copy()
    df1.rename(columns={product_extract_col: "Page Type"}, inplace=True)
    df2.rename(columns={category_extract_col: "Page Type"}, inplace=True)
    df1["Page Type"] = "Product Page"
    df2["Page Type"] = "Category Page"
    df_internal_html = pd.concat([df1, df2])

    # ------------------------------------ extract the domain from the crawl -----------------------------------------------

    extracted_domain = df_internal_html["Address"]
    extracted_domain = extracted_domain.str.split("/").str[2]
    url = extracted_domain.iloc[0]
    url_slash = http_or_https_gsc + url + "/"  # adds a trailing slash to the domain to query the gsc api

    # -------------------------------- make the products & category dataframes ---------------------------------------------

    df_sf_products = df_internal_html[df_internal_html['Page Type'].str.contains("Product Page")].copy()
    df_sf_categories = df_internal_html[df_internal_html['Page Type'].str.contains("Category Page")].copy()

    df_sf_products.drop_duplicates(subset="H1-1", inplace=True)  # drop duplicate values (drop pagination pages etc)
    df_sf_categories.drop_duplicates(subset="H1-1", inplace=True)  # drop duplicate values (drop pagination pages etc)
    df_sf_categories = df_sf_categories[~df_sf_categories["Address"].str.contains(parms, na=False)]
    df_all_inlinks.drop_duplicates(subset=["Source", "Destination"], keep="first", inplace=True)
    df_all_inlinks = pd.merge(df_all_inlinks, df_sf_categories, left_on="Source", right_on="Address", how="left")
    df_all_inlinks = df_all_inlinks[df_all_inlinks["Page Type"].isin(["Category Page"])]

    df_all_inlinks = df_all_inlinks[["Destination", "Source"]]

    df_sf_products = pd.merge(df_sf_products, df_all_inlinks, left_on="Address", right_on="Destination", how="left")
    df_sf_products.rename(columns={"Source": "Parent URL", "Address": "Product URL"}, inplace=True)
    df_sf_products = df_sf_products[df_sf_products["Parent URL"].notna()]  # Only Keep Rows which are not NaN

    # ---------------------------- group dataframes & make lists for n-gramming --------------------------------------------

    df_product_group = (df_sf_products.groupby("Product URL").agg({"Parent URL": "first"}).reset_index())

    category_extractor_list = list(set(df_product_group["Parent URL"]))
    len_product_list = len(category_extractor_list)

    # ---------------------------------------- start n-gram routine --------------------------------------------------------

    ngram_loop_count = 1
    start_num = 0
    appended_data = []

    for i in stqdm(range(0, ngram_loop_count)):
        print("Calculating ngrams")
        while ngram_loop_count != len_product_list:
            df_kwe = df_sf_products[
                df_sf_products["Parent URL"].str.contains(category_extractor_list[start_num], na=False)]
            text = str(df_kwe["H1-1"])

            # clean up the corpus before ngramming
            text = "".join(c for c in text if not c.isdigit())  # removes all numbers
            text = re.sub("<.*>", "", text)
            text = re.sub(r"\b[a-zA-Z]\b", "", text)
            punctuationNoFullStop = "[" + re.sub("\.", "", string.punctuation) + "]"
            text = re.sub(punctuationNoFullStop, "", text)

            # first get individual words
            tokenized = text.split()
            oneNgrams = ngrams(tokenized, 1)
            twoNgrams = ngrams(tokenized, 2)
            threeNgrams = ngrams(tokenized, 3)
            fourNgrams = ngrams(tokenized, 4)
            fiveNgrams = ngrams(tokenized, 5)
            sixNgrams = ngrams(tokenized, 6)
            sevenNgrams = ngrams(tokenized, 7)
            oneNgramsFreq = collections.Counter(oneNgrams)
            twoNgramsFreq = collections.Counter(twoNgrams)
            threeNgramsFreq = collections.Counter(threeNgrams)
            fourNgramsFreq = collections.Counter(fourNgrams)
            fiveNgramsFreq = collections.Counter(fiveNgrams)
            sixNgramsFreq = collections.Counter(sixNgrams)
            sevenNgramsFreq = collections.Counter(sevenNgrams)

            # combines the above collection counters so they can be placed in a dataframe.
            ngrams_combined_list = (
                    twoNgramsFreq.most_common(100)
                    + threeNgramsFreq.most_common(100)
                    + fourNgramsFreq.most_common(100)
                    + fiveNgramsFreq.most_common(100)
                    + sixNgramsFreq.most_common(100)
                    + sevenNgramsFreq.most_common(100)
            )

            # create the final dataframe
            df_ngrams = pd.DataFrame(ngrams_combined_list, columns=["Keyword", "Frequency"])
            df_ngram_frequency = pd.DataFrame(ngrams_combined_list, columns=["Keyword", "Frequency"])
            df_ngrams["Parent Category"] = category_extractor_list[start_num]
            data = df_ngrams
            appended_data.append(data)
            start_num = start_num + 1
            ngram_loop_count = ngram_loop_count + 1

    df_ngrams = pd.concat(appended_data)  # concat the list of dataframes
    ngram_count = df_ngrams.shape[0]  # get the row count
    st.info(f'Total keywords generated via n-grams: {ngram_count:,}')

    df_ngrams = df_ngrams.sort_values(by="Frequency", ascending=False)
    df_ngrams["Keyword"] = [' '.join(entry) for entry in df_ngrams["Keyword"]]
    cols = "Parent Category", "Keyword", "Frequency"
    df_ngrams = df_ngrams.reindex(columns=cols)

    # ---------------------------------------- pre-filtering ---------------------------------------------------------------

    df_ngrams = df_ngrams[~df_ngrams['Keyword'].astype(str).str.startswith(("and", "with", "for", "mm ", "cm ", "of"))]
    df_ngrams = df_ngrams[~df_ngrams['Keyword'].astype(str).str.endswith(("and", "with", "for", "mm ", "cm ", "of"))]

    df_sf_products["H1-1"] = df_sf_products["H1-1"].astype(str)
    df_sf_products["H1-1"] = df_sf_products["H1-1"].str.lower()

    df_product_set = set(df_sf_products["H1-1"])  # make a set, then a list
    target_keyword_list = list(df_product_set)
    keyword_list = list(df_ngrams["Keyword"])  # make the keyword list

    # ---------------------- keep only suggestions which match to products x times -----------------------------------------

    st.write(f"Exact matching to a minimum of {min_product_match_exact} products...")
    check_list_exact = []
    for i in stqdm(keyword_list):
        check_freq = sum(i in s for s in target_keyword_list)
        check_list_exact.append(check_freq)

    # search in a fuzzy match
    if enable_fuzzy_product_match:
        st.write(f"Fuzzy matching keywords to a minimum of {min_product_match_fuzzy} products...")
        check_list_fuzzy = []
        for keywords in stqdm(keyword_list):
            check_list_fuzzy.append(
                sum(all(keyword in target for keyword in keywords.split())
                    for target in target_keyword_list)
            )

    df_ngrams["Matching Products (Exact)"] = check_list_exact

    if enable_fuzzy_product_match:
        df_ngrams["Matching Products (Fuzzy)"] = check_list_fuzzy

    df_ngrams = df_ngrams[df_ngrams["Matching Products (Exact)"] >= min_product_match_exact]

    if enable_fuzzy_product_match:
        df_ngrams = df_ngrams[df_ngrams["Matching Products (Fuzzy)"] >= min_product_match_fuzzy]

    rows = df_ngrams.shape[0]
    ngram_loop_count = 1
    start = 1
    end = 100
    df_data = []

    # ------------------------ fuzz match suggested keywords to existing categories ----------------------------------------

    df_ngrams = df_ngrams[df_ngrams["Keyword"].notna()]  # Only Keep Rows which are not NaN
    df_sf_categories = df_sf_categories[df_sf_categories["H1-1"].notna()]  # Only Keep Rows which are not NaN
    df_keyword_list = list(df_ngrams["Keyword"])  # create lists from dfs
    df_sf_cats_list = list(df_sf_categories["H1-1"])  # create lists from dfs
    model = PolyFuzz("TF-IDF").match(df_keyword_list, df_sf_cats_list)  # do the matching

    st.write("Checking if suggestions match to an existing category ..")
    df_fuzz = model.get_matches()  # make the polyfuzz dataframes
    df_ngrams = pd.merge(df_ngrams, df_fuzz, left_on="Keyword", right_on="From")
    df_ngrams.rename(columns={"To": "Matched Category", "clicks": "Clicks", "impressions": "Impressions"},
                     inplace=True)

    if kwe_key != "":
        # -------------------------- check available keywords everywhere credits------------------------------------------------

        df_ngrams.drop_duplicates(subset=["Keyword"], keep="first", inplace=True)
        creds_required = df_ngrams.shape[0]

        my_headers = {
            'Accept': 'application/json',
            'Authorization': 'Bearer ' + kwe_key
        }
        response = requests.get('https://api.keywordseverywhere.com/v1/account/credits', headers=my_headers)
        if response.status_code == 200:
            creds_available = response.content.decode('utf-8')
            creds_available = creds_available.split()
            creds_available = int(creds_available[1])
            st.write(f"This operation will require {creds_required:,} Keywords Everywhere API credits. You have {creds_available:,} credits remaining.")
            if creds_available < creds_required:
                st.write("Not enough keywords everywhere credits available!")
                st.stop
        else:
            st.write("An error occurred\n\n", response.content.decode('utf-8'))

        # ---------------------- get search volume with keywords everywhere -----------------------------------------------------

        loops = int(creds_required / 100)
        if loops == 1:
            loops += 1
        fixed_loops = loops * 100  # fixes the total loop counter displayed value
        ngram_loop_count_100 = ngram_loop_count * 100

        st.write("Fetching search volume & CPC data from Keywords Everywhere...")
        for i in stqdm(range(0, loops)):

            while ngram_loop_count != loops:
                keywords = list(df_ngrams["Keyword"][start:end])
                keywords_set = set(keywords)
                keywords = list(keywords_set)
                my_data = {
                    'country': country_kwe,
                    'currency': currency_kwe,
                    'dataSource': data_source_kwe,
                    'kw[]': keywords
                }
                my_headers = {
                    'Accept': 'application/json',
                    'Authorization': 'Bearer ' + kwe_key
                }
                response = requests.post(
                    'https://api.keywordseverywhere.com/v1/get_keyword_data', data=my_data, headers=my_headers)
                try:
                    keywords_data = response.json()['data']
                except KeyError:
                    print("Couldn't retrieve data from Keywords Everywhere. Check credits...")
                    pass

                vol = []
                cpc = []

                for element in keywords_data:
                    vol.append(element["vol"])
                    cpc.append(element["cpc"]["value"])

                rows = zip(keywords, vol, cpc)
                df_kwe = pd.DataFrame(rows, columns=["Keyword", "Search Volume", "CPC"])
                data = df_kwe
                df_data.append(data)
                start = start + 100
                end = end + 100
                ngram_loop_count += +1
                ngram_loop_count_100 += 100

        st.success("Got search volume and CPC data successfully!")

        df_kwe = pd.concat(df_data)
        df_kwe["Search Volume"] = df_kwe["Search Volume"].astype(int)
        df_kwe["CPC"] = df_kwe["CPC"].astype(float)
        df_kwe = df_kwe[df_kwe["Search Volume"] > min_search_vol]
        df_kwe = df_kwe[df_kwe["CPC"] > min_cpc]
        df_kwe = pd.merge(df_kwe, df_ngrams, on="Keyword", how='left')
        df_kwe = df_kwe.sort_values(by="Parent Category", ascending=True)

        # ------------------------------------- clean up the final dataframe ---------------------------------------------------

        df_kwe["Similarity"] = df_kwe["Similarity"] * 100
        df_kwe.fillna({"Similarity": 0}, inplace=True)
        df_kwe["Similarity"] = df_kwe["Similarity"].astype(int)
        df_kwe = df_kwe[df_kwe["Similarity"] <= min_sim_match]
        df_kwe["Matched Category"] = df_kwe["Matched Category"].str.lower()
        df_kwe.drop_duplicates(subset=["Matched Category", "Keyword"], keep="first", inplace=True)
    else:
        df_kwe = df_ngrams

    cols = (
        "Parent Category",
        "Keyword",
        "Search Volume",
        "CPC",
        "Matching Products (Exact)",
        "Matching Products (Fuzzy)",
        "Similarity",
        "Matched Category",
    )
    df_kwe = df_kwe.reindex(columns=cols)

    # --------------------------- keep the longest word and discard the fragments ------------------------------------------

    if keep_longest_word == True:
        st.write("Keeping longest word and discarding fragments...")

        list1 = df_kwe["Keyword"]
        substrings = {w1 for w1 in list1 for w2 in list1 if w1 in w2 and w1 != w2}
        longest_word = set(list1) - substrings
        longest_word = list(longest_word)
        shortest_word_list = list(set(list1) - set(longest_word))
        with st.expander(f"View {len(shortest_word_list)} discarded fragments"):
            st.write(shortest_word_list)
        df_kwe = df_kwe[~df_kwe['Keyword'].isin(shortest_word_list)]

    # ------------------------------ merge in page title for matched category ----------------------------------------------

    df_mini = df_internal_html[["H1-1", "Title 1"]]
    df_mini = df_mini.rename(columns={"H1-1": "Matched Category", "Title 1": "Matched Category Page Title"})
    df_kwe = pd.merge(df_kwe, df_mini[['Matched Category', 'Matched Category Page Title']], on='Matched Category',
                      how='left')

    # ---------------------- remove keyword suggestions if matched to an existing category in any order --------------------

    df_kwe['Matched Category Page Title Lower'] = df_kwe['Matched Category Page Title'].str.lower()
    df_kwe = df_kwe.astype({"Keyword": "str", "Matched Category": "str", "Matched Category Page Title Lower": "str"})
    col = "Keyword"


    def ismatch(s):
        A = set(s[col].split())
        B = set(s['Matched Category Page Title Lower'].split())
        return A.intersection(B) == A


    df_kwe['KW Matched'] = df_kwe.apply(ismatch, axis=1)

    # --------------------------------------------- handling pluralised words ------------------------------------------

    df_kwe["Keyword + s"] = df_kwe["Keyword"] + "s"  # make new temp column to run the same check on the pluralised word
    col = "Keyword + s"  # updates the column to run function on
    df_kwe['KW Matched 2'] = df_kwe.apply(ismatch, axis=1)
    df_kwe = df_kwe[~df_kwe["KW Matched"].isin([True])]  # drop rows which are matched
    df_kwe = df_kwe[~df_kwe["KW Matched 2"].isin([True])]  # drop rows which are matched
    df_kwe.drop_duplicates(subset=["Parent Category", "Keyword"], keep="first",
                           inplace=True)  # drop if both values dupes

    # ---------------------- Set the Final Column Order ------------------------------------------------------------------

    cols = (
        "Parent Category",
        "Keyword",
        "Search Volume",
        "CPC",
        "Matching Products (Exact)",
        "Matching Products (Fuzzy)",
        "Matched Category",
        "Similarity",
        "Matched Category Page Title",
    )
    df_kwe = df_kwe.reindex(columns=cols)

    if enable_fuzzy_product_match == False:
        del df_kwe['Matching Products (Fuzzy)']
    if kwe_key == "":
        del df_kwe['Search Volume']
        del df_kwe['CPC']

    # ---------------------- Export Final Dataframe to CSV -------------------------------------------------------------

    keyword_volume_count = df_kwe.shape[0]
    df_kwe.sort_values(["Parent Category", "Keyword"], ascending=[True, True], inplace=True)

    st.markdown("---")
    st.subheader("Results")
    st.success(f"Found {keyword_volume_count:,} category suggestions!")

    # Show preview
    st.dataframe(df_kwe, use_container_width=True)

    @st.cache_data
    def convert_df(df):  # IMPORTANT: Cache the conversion to prevent computation on every rerun
        return df.to_csv(index=False).encode('utf-8')

    csv = convert_df(df_kwe)
    st.download_button(
        label="Download Category Suggestions (CSV)",
        data=csv,
        file_name='category_opportunities.csv',
        mime='text/csv',
        type='primary',
        use_container_width=True
    )