# Source: search-solved-public-seo/content-analysis/entity-extractor/entity_extractor.py (main branch, admariner/search-solved-public-seo on GitHub)
"""
Entity Extractor - Extract Named Entities from Text/HTML using SpaCy NLP
Identify people, organizations, locations, and other entities in your content.

Author: Lee Foot
Date: January 2025
"""

import streamlit as st
import pandas as pd
from bs4 import BeautifulSoup
from io import BytesIO

# Set the page title, the page icon, and the wide layout for the app.
st.set_page_config(page_title="Entity Extractor", page_icon="🔍", layout="wide")

# Record whether spaCy can be imported instead of failing at import time.
try:
    import spacy
except ImportError:
    SPACY_AVAILABLE = False
else:
    SPACY_AVAILABLE = True

st.title("🔍 Entity Extractor")
st.markdown("*Created by* [![Website](https://img.shields.io/badge/-leefoot.com-2A9D8F?logoColor=white)](https://www.leefoot.com) · [![Hire Me](https://img.shields.io/badge/-Hire%20Me-FF6B6B?logoColor=white)](https://www.leefoot.com/contact) · [![LinkedIn](https://img.shields.io/badge/-LinkedIn-0A66C2?logo=linkedin&logoColor=white)](https://www.linkedin.com/in/lee-foot/) · [![Bluesky](https://img.shields.io/badge/-Bluesky-0285FF?logoColor=white)](https://bsky.app/profile/leefootseo.bsky.social) · [![More Tools](https://img.shields.io/badge/-More%20Tools-8B5CF6?logoColor=white)](https://leefoot.com/tools) · [![GitHub](https://img.shields.io/badge/-GitHub-6B7280?logoColor=white)](https://github.com/searchsolved/search-solved-public-seo)")

# Usage notes. The steps describe the controls this script renders: sidebar
# model/type pickers, a text area or CSV/Excel upload, and an extract button.
# No API key is requested anywhere in this script, so none is mentioned here.
with st.expander("How to use this tool"):
    st.markdown("""
    **What this tool does:**
    - Extracts named entities from text or HTML content
    - Identifies people, places, organizations and other entity types
    - Counts how often each entity appears

    **How to use:**
    1. Choose a SpaCy model and the entity types to extract in the sidebar
    2. Paste text/HTML or upload a CSV/Excel file
    3. Click "Extract Entities"
    4. Review the categorized entities and download the results

    **Best for:**
    - Content optimization for topical authority
    - Competitive entity analysis
    - Knowledge graph building
    """)
st.markdown("""
Extract named entities (people, organizations, locations) from text using SpaCy NLP.
Identify key topics and entities in your content for semantic SEO analysis.
""")

# Without spaCy nothing below can work: show install steps and end this run.
if not SPACY_AVAILABLE:
    st.error("""
    **SpaCy is not installed.** Please install it with:
    ```bash
    pip install spacy
    python -m spacy download en_core_web_sm
    ```
    """)
    st.stop()

# Sidebar: choose the spaCy model and the entity labels to keep.
st.sidebar.header("Configuration")

# Model name -> short description shown next to it in the selector.
model_options = {
    'en_core_web_sm': 'Small (fast, less accurate)',
    'en_core_web_md': 'Medium (balanced)',
    'en_core_web_lg': 'Large (slow, most accurate)'
}


def _model_label(model_name):
    """Build the selector text for a model: '<name> - <description>'."""
    return f"{model_name} - {model_options[model_name]}"


selected_model = st.sidebar.selectbox(
    "SpaCy Model",
    options=list(model_options),
    format_func=_model_label,
    help="Larger models are more accurate but slower. You need to download the model first."
)

# Entity label -> human-readable description shown in the multiselect.
entity_types = {
    'PERSON': 'People, including fictional',
    'NORP': 'Nationalities, religious or political groups',
    'FAC': 'Buildings, airports, highways, bridges',
    'ORG': 'Companies, agencies, institutions',
    'GPE': 'Countries, cities, states',
    'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
    'PRODUCT': 'Objects, vehicles, foods (not services)',
    'EVENT': 'Named hurricanes, battles, wars, sports events',
    'WORK_OF_ART': 'Titles of books, songs, etc.',
    'LAW': 'Named documents made into laws',
    'LANGUAGE': 'Any named language',
}

# Labels deliberately left out of the selectable list above.
# NOTE(review): this list is not referenced again in this script.
excluded_types = ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']


def _entity_type_label(type_code):
    """Build the multiselect text for a label: '<LABEL>: <description>'."""
    return f"{type_code}: {entity_types[type_code]}"


selected_types = st.sidebar.multiselect(
    "Entity Types to Extract",
    options=list(entity_types),
    default=list(entity_types),
    format_func=_entity_type_label,
    help="Select which entity types to extract"
)

# Load model
@st.cache_resource
def _load_cached_spacy_model(model_name):
    """Load a spaCy pipeline by name, caching successful loads.

    Raises:
        OSError: propagated from ``spacy.load`` when the model cannot be
            loaded (for example because it has not been downloaded).
    """
    return spacy.load(model_name)


def load_spacy_model(model_name):
    """Return the spaCy pipeline for ``model_name``, or ``None`` on failure.

    Caching is applied to the private loader rather than to this function so
    that a failed load is never stored. Caching the ``None`` result would keep
    reporting the model as missing after the user installs it, until the
    cache is cleared or the app restarts.

    Args:
        model_name: Name of the spaCy model package to load.

    Returns:
        The loaded pipeline, or ``None`` if ``spacy.load`` raised ``OSError``.
    """
    try:
        return _load_cached_spacy_model(model_name)
    except OSError:
        return None

# Load the pipeline chosen in the sidebar. If it is unavailable, show the
# download command and end this script run.
nlp = load_spacy_model(selected_model)

if nlp is not None:
    st.success(f"Using SpaCy model: {selected_model}")
else:
    st.error(f"""
    **Model '{selected_model}' is not installed.** Download it with:
    ```bash
    python -m spacy download {selected_model}
    ```
    """)
    st.stop()


def clean_html(html_content):
    """Return the visible text of an HTML (or plain-text) value.

    Missing values (as judged by ``pd.isna``) yield an empty string. Anything
    else is converted to ``str``, parsed with the built-in HTML parser, and
    stripped of the tags listed below before its text is joined with spaces.
    """
    if pd.isna(html_content):
        return ""

    # Tags removed together with their contents before text extraction.
    noisy_tags = ['script', 'style', 'nav', 'footer', 'header', 'ol', 'ul', 'table']

    parsed = BeautifulSoup(str(html_content), 'html.parser')
    for element in parsed.find_all(noisy_tags):
        element.decompose()

    return parsed.get_text(separator=' ', strip=True)


def extract_entities(text, nlp_model, allowed_types):
    """Run ``nlp_model`` over ``text`` and collect the permitted entities.

    Args:
        text: Text to analyse; empty or missing input returns ``[]``.
        nlp_model: Callable returning an object with an ``ents`` iterable
            whose items expose ``text`` and ``label_``.
        allowed_types: Container of labels to keep.

    Returns:
        A list of ``(entity_text, label)`` tuples in document order, with
        surrounding whitespace stripped and blank entities dropped.
    """
    if not text or pd.isna(text):
        return []

    # Only the first 100000 characters are analysed to avoid memory issues.
    char_limit = 100000
    doc = nlp_model(text[:char_limit])

    found = []
    for span in doc.ents:
        if span.label_ not in allowed_types:
            continue
        surface = span.text.strip()
        if surface:
            found.append((surface, span.label_))
    return found


# Input methods
st.subheader("Input Content")
input_method = st.radio(
    "Choose input method:",
    ["Text Area", "CSV Upload"],
    horizontal=True
)

if input_method == "Text Area":
    # Single-document mode: analyse whatever was pasted into the text area.
    text_input = st.text_area(
        "Enter text or HTML content",
        height=300,
        placeholder="Paste your text or HTML content here..."
    )

    if text_input and st.button("🔍 Extract Entities", type="primary"):
        with st.spinner("Extracting entities..."):
            # Clean HTML if present
            cleaned_text = clean_html(text_input)

            # Extract entities
            entities = extract_entities(cleaned_text, nlp, selected_types)

            if entities:
                # Create dataframe
                df = pd.DataFrame(entities, columns=['Entity', 'Label'])

                # Count frequencies
                entity_counts = df.groupby(['Entity', 'Label']).size().reset_index(name='Count')
                entity_counts = entity_counts.sort_values('Count', ascending=False)

                st.subheader("Extracted Entities")

                # Summary metrics
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Total Entities", len(entities))
                with col2:
                    st.metric("Unique Entities", len(entity_counts))
                with col3:
                    st.metric("Entity Types", entity_counts['Label'].nunique())

                # Display by type
                st.subheader("Entities by Type")
                for label in sorted(entity_counts['Label'].unique()):
                    with st.expander(f"{label} ({len(entity_counts[entity_counts['Label'] == label])})"):
                        type_df = entity_counts[entity_counts['Label'] == label][['Entity', 'Count']]
                        st.dataframe(type_df, use_container_width=True, hide_index=True)

                # Full table
                st.subheader("All Entities")
                st.dataframe(entity_counts, use_container_width=True, hide_index=True)

                # Download
                csv_buffer = BytesIO()
                entity_counts.to_csv(csv_buffer, index=False)
                csv_buffer.seek(0)

                st.download_button(
                    label="📥 Download Entities (CSV)",
                    data=csv_buffer,
                    file_name="extracted_entities.csv",
                    mime="text/csv"
                )
            else:
                st.warning("No entities found in the provided text.")

else:
    # Batch mode: analyse one column of an uploaded CSV/Excel file row by row.
    uploaded_file = st.file_uploader(
        "Upload CSV with content",
        type=['csv', 'xlsx'],
        help="Upload a CSV/Excel file containing text content to analyze"
    )

    if uploaded_file:
        # Load file. The extension is compared case-insensitively so that a
        # name such as "REPORT.XLSX" is not handed to the CSV parser.
        if uploaded_file.name.lower().endswith('.xlsx'):
            df = pd.read_excel(uploaded_file)
        else:
            df = pd.read_csv(uploaded_file)

        st.write("Preview of uploaded data:")
        st.dataframe(df.head())

        col1, col2 = st.columns(2)

        with col1:
            content_column = st.selectbox(
                "Select content column",
                options=df.columns.tolist(),
                help="Column containing the text/HTML to analyze"
            )

        with col2:
            id_column = st.selectbox(
                "Select ID column (optional)",
                options=['None'] + df.columns.tolist(),
                help="Column to use as identifier (e.g., URL, Address)"
            )

        if st.button("🔍 Extract Entities", type="primary"):
            progress_bar = st.progress(0)
            status_text = st.empty()

            all_results = []
            total_rows = len(df)

            # Progress is driven by the row's position, not its index label,
            # so it stays within 0..1 whatever index the DataFrame carries.
            for position, (idx, row) in enumerate(df.iterrows(), start=1):
                content = row[content_column]
                # Fall back to the index label when no ID column was chosen.
                identifier = row[id_column] if id_column != 'None' else idx

                # Clean and extract
                cleaned = clean_html(content)
                entities = extract_entities(cleaned, nlp, selected_types)

                for entity, label in entities:
                    all_results.append({
                        'Source': identifier,
                        'Entity': entity,
                        'Label': label
                    })

                progress_bar.progress(position / total_rows)
                status_text.text(f"Processing: {position}/{total_rows}")

            progress_bar.empty()
            status_text.empty()

            if all_results:
                results_df = pd.DataFrame(all_results)

                # Summary
                st.subheader("Results Summary")
                col1, col2, col3 = st.columns(3)

                with col1:
                    st.metric("Total Entities Found", len(results_df))
                with col2:
                    st.metric("Unique Entities", results_df['Entity'].nunique())
                with col3:
                    st.metric("Sources Processed", results_df['Source'].nunique())

                # Aggregate once; the on-screen table shows the first 50 rows
                # and the download further down receives the full table.
                top_entities_full = results_df.groupby(['Entity', 'Label']).size().reset_index(name='Count')
                top_entities_full = top_entities_full.sort_values('Count', ascending=False)

                # Entity frequency across all sources
                st.subheader("Top Entities (All Sources)")
                top_entities = top_entities_full.head(50)
                st.dataframe(top_entities, use_container_width=True, hide_index=True)

                # Full results
                with st.expander("View All Results"):
                    st.dataframe(results_df, use_container_width=True, hide_index=True)

                # Downloads
                st.subheader("Download Results")

                col1, col2 = st.columns(2)

                with col1:
                    # Full results
                    csv_buffer = BytesIO()
                    results_df.to_csv(csv_buffer, index=False)
                    csv_buffer.seek(0)

                    st.download_button(
                        label="📥 Download Full Results (CSV)",
                        data=csv_buffer,
                        file_name="entities_by_source.csv",
                        mime="text/csv"
                    )

                with col2:
                    # Aggregated counts
                    agg_buffer = BytesIO()
                    top_entities_full.to_csv(agg_buffer, index=False)
                    agg_buffer.seek(0)

                    st.download_button(
                        label="📥 Download Entity Counts (CSV)",
                        data=agg_buffer,
                        file_name="entity_counts.csv",
                        mime="text/csv"
                    )
            else:
                st.warning("No entities found in the uploaded content.")

# Landing help, shown only while the text-area mode is active and empty.
# The `and` short-circuits, so `text_input` is read only in text-area mode.
show_landing_help = input_method == "Text Area" and not text_input

if show_landing_help:
    st.info("👆 Enter text or upload a file to get started.")

    landing_help = """
    ### Entity Types Explained
    | Type | Description | Examples |
    |------|-------------|----------|
    | PERSON | People names | Elon Musk, Shakespeare |
    | ORG | Organizations | Google, NASA, WHO |
    | GPE | Countries/Cities | London, United States |
    | LOC | Locations | Mount Everest, Pacific Ocean |
    | PRODUCT | Products | iPhone, Tesla Model S |
    | EVENT | Events | World War II, Olympics |
    | WORK_OF_ART | Creative works | Mona Lisa, Hamlet |

    ### Requirements
    - SpaCy library installed
    - SpaCy English model downloaded

    ```bash
    pip install spacy
    python -m spacy download en_core_web_sm  # Small (fast)
    python -m spacy download en_core_web_md  # Medium
    python -m spacy download en_core_web_lg  # Large (best)
    ```

    ### Use Cases
    - Content entity analysis for semantic SEO
    - Identifying key topics in competitor content
    - Building comprehensive topic coverage
    - Understanding entity relationships in content
    """
    st.markdown(landing_help)