forked from searchsolved/search-solved-public-seo
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathentity_extractor_cli.py
More file actions
115 lines (90 loc) · 3.65 KB
/
entity_extractor_cli.py
File metadata and controls
115 lines (90 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
"""
Entity Extractor - CLI Version
Extract named entities from text using SpaCy NLP.
Usage:
python entity_extractor_cli.py --input content.csv --output entities.csv
Author: Lee Foot
Website: https://www.leefoot.com
"""
import argparse
import pandas as pd
from bs4 import BeautifulSoup
import sys
try:
import spacy
except ImportError:
print("Error: SpaCy is not installed. Install with: pip install spacy")
print("Then download a model: python -m spacy download en_core_web_sm")
sys.exit(1)
def clean_html(html_content):
    """Strip markup from an HTML fragment and return its visible text.

    Boilerplate/navigation elements (scripts, styles, nav, lists, tables,
    headers and footers) are removed entirely before text extraction.
    NaN/missing values yield an empty string.
    """
    if pd.isna(html_content):
        return ""
    soup = BeautifulSoup(str(html_content), 'html.parser')
    # These tags rarely hold page content worth running NER over.
    unwanted = ('script', 'style', 'nav', 'footer', 'header', 'ol', 'ul', 'table')
    for element in soup.find_all(list(unwanted)):
        element.decompose()
    return soup.get_text(separator=' ', strip=True)
def extract_entities(text, nlp_model, allowed_types, max_chars=100000):
    """Run an NLP pipeline over *text* and collect matching named entities.

    Args:
        text: Plain text to analyse. None/NaN/empty yields [].
        nlp_model: A loaded SpaCy language model (any callable returning a
            doc with an ``.ents`` sequence of spans with ``.text``/``.label_``).
        allowed_types: Iterable of entity labels to keep (e.g. 'ORG', 'GPE').
        max_chars: Truncate input beyond this length to bound memory and
            processing time (previously a hard-coded constant).

    Returns:
        List of (entity_text, label) tuples, whitespace-stripped, with
        empty-text entities filtered out.
    """
    if not text or pd.isna(text):
        return []
    # Guard against pathological inputs: pipeline cost grows with doc size.
    if len(text) > max_chars:
        text = text[:max_chars]
    doc = nlp_model(text)
    # Materialize once: O(1) membership, and tolerates one-shot iterables.
    allowed = set(allowed_types)
    return [
        (ent.text.strip(), ent.label_)
        for ent in doc.ents
        if ent.label_ in allowed and ent.text.strip()
    ]
def _resolve_content_column(df, requested):
    """Return the column matching *requested* case-insensitively.

    Falls back to the first column — with a visible warning instead of the
    previous silent fallback — when no match exists.
    """
    for col in df.columns:
        if col.lower() == requested.lower():
            return col
    fallback = df.columns[0]
    print(f"Warning: column '{requested}' not found; using '{fallback}' instead.")
    return fallback


def main():
    """CLI entry point: read a CSV, extract entities per row, write aggregated counts."""
    parser = argparse.ArgumentParser(description='Extract named entities from text using SpaCy')
    parser.add_argument('--input', required=True, help='Input CSV with content')
    parser.add_argument('--output', default='entities.csv', help='Output CSV path')
    parser.add_argument('--content-col', default='content', help='Content column name')
    parser.add_argument('--id-col', help='Optional ID column (e.g., URL)')
    parser.add_argument('--model', default='en_core_web_sm', help='SpaCy model name')
    parser.add_argument('--entity-types', nargs='+', default=['PERSON', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT'],
                        help='Entity types to extract')
    args = parser.parse_args()

    print(f"Loading SpaCy model: {args.model}")
    try:
        nlp = spacy.load(args.model)
    except OSError:
        print(f"Error: Model '{args.model}' not found. Download with: python -m spacy download {args.model}")
        sys.exit(1)

    print(f"Loading content from: {args.input}")
    df = pd.read_csv(args.input)

    content_col = _resolve_content_column(df, args.content_col)
    # Resolve the identifier column once, outside the loop.
    id_col = args.id_col if args.id_col and args.id_col in df.columns else None

    all_results = []
    total = len(df)
    # enumerate() gives a reliable positional counter; the previous code used
    # the iterrows() index label, which is only a position for a RangeIndex.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        identifier = row[id_col] if id_col else idx
        cleaned = clean_html(row[content_col])
        for entity, label in extract_entities(cleaned, nlp, args.entity_types):
            all_results.append({
                'Source': identifier,
                'Entity': entity,
                'Label': label
            })
        if position % 10 == 0:
            print(f" Processed {position}/{total} rows...")

    if not all_results:
        print("No entities found in the content.")
        sys.exit(1)

    df_results = pd.DataFrame(all_results)
    # Aggregate duplicate mentions into per-entity counts, most frequent first.
    entity_counts = (
        df_results.groupby(['Entity', 'Label'])
        .size()
        .reset_index(name='Count')
        .sort_values('Count', ascending=False)
    )
    # utf-8-sig so Excel opens the file with correct encoding detection.
    entity_counts.to_csv(args.output, index=False, encoding='utf-8-sig')
    print(f"\nResults saved to: {args.output}")
    print(f" Total entities found: {len(df_results)}")
    print(f" Unique entities: {len(entity_counts)}")
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()