/**
 * Metadata Extractor for Academic Articles
 * Supports Schema.org, Dublin Core, Citation meta tags, and more
 */

/**
 * Collects bibliographic metadata (title, authors, journal, year, DOI,
 * abstract, URL) for the article shown in the current document.
 *
 * Sources are consulted in priority order; a field that has already been
 * filled by an earlier source is never overwritten by a later one.
 */
class MetadataExtractor {
    /**
     * Start with an empty record. `url` is taken from `window.location.href`
     * when a `window` global exists and is `null` otherwise, so the class can
     * be constructed outside a browser (e.g. when loaded through the
     * CommonJS export).
     */
    constructor() {
        this.metadata = {
            title: null,
            authors: null,
            journal: null,
            year: null,
            doi: null,
            abstract: null,
            url: typeof window !== 'undefined' ? window.location.href : null
        };
    }

    /**
     * Extract all available metadata from the page.
     *
     * @returns {Object|null} The populated metadata record, or `null` when
     *     the result does not pass {@link MetadataExtractor#isValid}.
     */
    extract() {
        // Site-specific extraction runs first so its values take precedence.
        this.extractFromSemanticScholar();

        this.extractFromSchemaOrg();
        this.extractFromMetaTags();
        this.extractFromOpenGraph();
        this.extractFromHTML();

        return this.isValid() ? this.metadata : null;
    }

    /**
     * Extract from Semantic Scholar pages: JSON-LD first, then the page
     * structure for anything still missing. Does nothing on other hosts.
     * Best-effort: failures are logged and never thrown.
     */
    extractFromSemanticScholar() {
        // Match the domain itself or a subdomain, not any hostname that
        // merely contains the text (e.g. "semanticscholar.org.example.com").
        const hostname = window.location.hostname;
        const isSemanticScholar = hostname === 'semanticscholar.org' ||
            hostname.endsWith('.semanticscholar.org');
        if (!isSemanticScholar) {
            return;
        }

        try {
            for (const item of this.readJsonLdItems()) {
                if (!this.hasSchemaType(item, ['ScholarlyArticle'])) {
                    continue;
                }

                this.metadata.title = this.metadata.title || item.name || item.headline;
                this.metadata.abstract = this.metadata.abstract || item.description;

                if (item.author) {
                    this.metadata.authors = this.metadata.authors || this.parseAuthors(item.author);
                }

                // Read the year from the text rather than via Date: a
                // date-only ISO string is parsed as UTC but getFullYear()
                // reports local time, which shifts "YYYY-01-01" to the
                // previous year in timezones west of UTC.
                if (item.datePublished) {
                    this.metadata.year = this.metadata.year || this.extractYear(item.datePublished);
                }

                if (item.identifier) {
                    this.metadata.doi = this.metadata.doi || this.extractDOI(item.identifier);
                }
            }

            // Fall back to the page structure if JSON-LD didn't provide a value.
            if (!this.metadata.title) {
                const titleEl = document.querySelector('h1[data-test-id="paper-detail-title"]');
                if (titleEl) {
                    this.metadata.title = titleEl.textContent.trim();
                }
            }

            if (!this.metadata.authors) {
                const authorEls = document.querySelectorAll('[data-test-id="author-list"] a, .author-list a');
                if (authorEls.length > 0) {
                    this.metadata.authors = Array.from(authorEls)
                        .map(el => el.textContent.trim())
                        .filter(Boolean)
                        .join('; ');
                }
            }

            // Last resort: the first 19xx/20xx number anywhere in the page text.
            if (!this.metadata.year) {
                const bodyText = document.body?.textContent ?? '';
                const yearMatch = bodyText.match(/\b(19|20)\d{2}\b/);
                if (yearMatch) {
                    this.metadata.year = Number.parseInt(yearMatch[0], 10);
                }
            }

            if (!this.metadata.abstract) {
                const abstractEl = document.querySelector('[data-test-id="paper-abstract-text"], .abstract-text, .paper-abstract');
                if (abstractEl) {
                    this.metadata.abstract = abstractEl.textContent.trim();
                }
            }

            if (!this.metadata.journal) {
                const venueEl = document.querySelector('[data-test-id="venue-metadata"], .venue-name');
                if (venueEl) {
                    this.metadata.journal = venueEl.textContent.trim();
                }
            }
        } catch (err) {
            // Deliberately non-fatal: the generic extractors still run.
            console.debug('MetadataExtractor: Semantic Scholar extraction failed', err);
        }
    }

    /**
     * Extract from Schema.org JSON-LD items typed `ScholarlyArticle` or
     * `Article`.
     */
    extractFromSchemaOrg() {
        for (const item of this.readJsonLdItems()) {
            if (!this.hasSchemaType(item, ['ScholarlyArticle', 'Article'])) {
                continue;
            }

            this.metadata.title = this.metadata.title || item.headline || item.name;
            this.metadata.abstract = this.metadata.abstract || item.description;
            this.metadata.doi = this.metadata.doi || this.extractDOI(item.identifier);

            if (item.author) {
                this.metadata.authors = this.metadata.authors || this.parseAuthors(item.author);
            }

            // See extractYear: avoids the UTC/local-time shift of Date parsing.
            if (item.datePublished) {
                this.metadata.year = this.metadata.year || this.extractYear(item.datePublished);
            }

            if (item.publisher && item.publisher.name) {
                this.metadata.journal = this.metadata.journal || item.publisher.name;
            }
        }
    }

    /**
     * Extract from meta tags (Citation tags first, Dublin Core as fallback).
     */
    extractFromMetaTags() {
        this.metadata.title = this.metadata.title ||
            this.getMetaContent('citation_title') ||
            this.getMetaContent('DC.title');

        // getAllMetaContent returns an array, and an empty array is truthy,
        // so the fallback has to test the length instead of using `||`.
        let authors = this.getAllMetaContent('citation_author');
        if (authors.length === 0) {
            authors = this.getAllMetaContent('DC.creator');
        }
        if (authors.length > 0 && !this.metadata.authors) {
            this.metadata.authors = authors.join('; ');
        }

        this.metadata.journal = this.metadata.journal ||
            this.getMetaContent('citation_journal_title') ||
            this.getMetaContent('citation_conference_title') ||
            this.getMetaContent('DC.publisher');

        const pubDate = this.getMetaContent('citation_publication_date') ||
                       this.getMetaContent('DC.date');
        if (pubDate && !this.metadata.year) {
            this.metadata.year = this.extractYear(pubDate);
        }

        // Route both tags through extractDOI so "doi:10.x/y" or resolver
        // URLs are reduced to the bare DOI, and a DC.identifier that holds
        // something else (ISSN, plain URL) is not stored as a DOI.
        this.metadata.doi = this.metadata.doi ||
            this.extractDOI(this.getMetaContent('citation_doi')) ||
            this.extractDOI(this.getMetaContent('DC.identifier'));

        this.metadata.abstract = this.metadata.abstract ||
            this.getMetaContent('citation_abstract') ||
            this.getMetaContent('DC.description');
    }

    /**
     * Extract title and abstract from OpenGraph tags.
     */
    extractFromOpenGraph() {
        this.metadata.title = this.metadata.title ||
            this.getMetaContent('og:title');

        this.metadata.abstract = this.metadata.abstract ||
            this.getMetaContent('og:description');
    }

    /**
     * Extract from HTML content as a last-resort fallback: first `<h1>` or
     * the document title, a DOI found in the page text, and a year that
     * follows "©", "copyright" or "published".
     */
    extractFromHTML() {
        if (!this.metadata.title) {
            this.metadata.title = document.querySelector('h1')?.textContent?.trim() ||
                                document.title;
        }

        // body can be null (e.g. while the document is still loading).
        const bodyText = document.body?.textContent ?? '';

        if (!this.metadata.doi) {
            this.metadata.doi = this.findDOI(bodyText);
        }

        if (!this.metadata.year) {
            const yearMatch = bodyText.match(/(?:©|\bcopyright\b|published)\s*(\d{4})/i);
            if (yearMatch) {
                // extractYear also rejects implausible years.
                this.metadata.year = this.extractYear(yearMatch[1]);
            }
        }
    }

    /**
     * Get the trimmed `content` of the first meta tag whose `name` or
     * `property` equals the given name.
     *
     * @param {string} name - Meta tag name/property; interpolated into a CSS
     *     selector, so it must not contain double quotes.
     * @returns {string|null} The content, or `null` when missing or empty.
     */
    getMetaContent(name) {
        const meta = document.querySelector(
            `meta[name="${name}"], meta[property="${name}"]`
        );
        return meta?.getAttribute('content')?.trim() || null;
    }

    /**
     * Get the contents of all meta tags with the same name (for authors, etc.).
     *
     * @param {string} name - Meta tag name/property (see getMetaContent).
     * @returns {string[]} Trimmed, non-empty contents in document order;
     *     empty array when there are none.
     */
    getAllMetaContent(name) {
        const metas = document.querySelectorAll(
            `meta[name="${name}"], meta[property="${name}"]`
        );
        return Array.from(metas).map(m => m.getAttribute('content')?.trim()).filter(Boolean);
    }

    /**
     * Parse and collect every object found in the page's JSON-LD scripts.
     * Top-level arrays are flattened and the members of an `@graph` array are
     * included alongside their container. Scripts that are not valid JSON
     * are logged and skipped.
     *
     * @returns {Object[]} Candidate JSON-LD items (possibly empty).
     */
    readJsonLdItems() {
        const isObject = (value) => value !== null && typeof value === 'object';
        const items = [];

        for (const script of document.querySelectorAll('script[type="application/ld+json"]')) {
            let data;
            try {
                data = JSON.parse(script.textContent);
            } catch (err) {
                console.debug('MetadataExtractor: skipping unparsable JSON-LD script', err);
                continue;
            }

            const roots = Array.isArray(data) ? data : [data];
            for (const root of roots) {
                if (!isObject(root)) {
                    continue;
                }
                items.push(root);
                if (Array.isArray(root['@graph'])) {
                    items.push(...root['@graph'].filter(isObject));
                }
            }
        }

        return items;
    }

    /**
     * Tell whether a JSON-LD item declares one of the wanted types.
     * `@type` may be a single string or an array of strings.
     *
     * @param {Object} item - JSON-LD item.
     * @param {string[]} wanted - Acceptable `@type` values.
     * @returns {boolean}
     */
    hasSchemaType(item, wanted) {
        const declared = item['@type'];
        const types = Array.isArray(declared) ? declared : [declared];
        return types.some(type => wanted.includes(type));
    }

    /**
     * Parse authors from various formats: a plain string, a person object
     * (`name`, or `givenName`/`familyName`), or an array mixing both.
     *
     * @param {string|Object|Array} authorData
     * @returns {string|null} Names joined with "; ", or `null` when no name
     *     could be found. Empty and null entries are skipped.
     */
    parseAuthors(authorData) {
        if (!authorData) {
            return null;
        }

        if (typeof authorData === 'string') {
            return authorData;
        }

        const nameOf = (author) => {
            if (typeof author === 'string') {
                return author.trim();
            }
            if (author === null || typeof author !== 'object') {
                return '';
            }
            if (author.name) {
                return String(author.name).trim();
            }
            return `${author.givenName || ''} ${author.familyName || ''}`.trim();
        };

        if (Array.isArray(authorData)) {
            const names = authorData.map(nameOf).filter(Boolean);
            return names.length > 0 ? names.join('; ') : null;
        }

        return nameOf(authorData) || null;
    }

    /**
     * Extract a DOI from various identifier formats: a bare DOI, text or a
     * URL containing one, an array of identifiers, or an object carrying the
     * identifier in `value` or `@id`.
     *
     * @param {*} identifier
     * @returns {string|null} The DOI, or `null` when none is found.
     */
    extractDOI(identifier) {
        if (!identifier) {
            return null;
        }

        if (typeof identifier === 'string') {
            const trimmed = identifier.trim();
            // Already a bare DOI: keep it verbatim.
            if (/^10\.\d{4,}\/\S/.test(trimmed)) {
                return trimmed;
            }
            return this.findDOI(trimmed);
        }

        if (Array.isArray(identifier)) {
            for (const id of identifier) {
                const doi = this.extractDOI(id);
                if (doi) {
                    return doi;
                }
            }
            return null;
        }

        if (typeof identifier === 'object') {
            return this.extractDOI(identifier.value) || this.extractDOI(identifier['@id']);
        }

        return null;
    }

    /**
     * Find the first DOI embedded in free text or a URL.
     *
     * The match runs up to the next whitespace, so punctuation belonging to
     * the surrounding sentence (trailing `. , ; : ' "` and a closing
     * parenthesis without an opening partner inside the DOI) is removed.
     *
     * @param {string} text
     * @returns {string|null} The DOI, or `null` when none is found.
     */
    findDOI(text) {
        const match = String(text ?? '').match(/\b10\.\d{4,}\/[^\s]+/);
        if (!match) {
            return null;
        }

        let doi = match[0];
        for (;;) {
            const trimmed = doi.replace(/[.,;:'"]+$/, '');
            const hasStrayParen = trimmed.endsWith(')') &&
                trimmed.split(')').length > trimmed.split('(').length;
            const next = hasStrayParen ? trimmed.slice(0, -1) : trimmed;
            if (next === doi) {
                break;
            }
            doi = next;
        }

        // Nothing left after the prefix means it was not a DOI after all.
        return doi.endsWith('/') ? null : doi;
    }

    /**
     * Extract the year from a date value by taking its first run of four
     * digits.
     *
     * @param {string|number|null|undefined} dateStr
     * @returns {number|null} The year when it lies between 1900 and next
     *     calendar year (inclusive), otherwise `null`.
     */
    extractYear(dateStr) {
        if (dateStr === null || dateStr === undefined) {
            return null;
        }

        const digits = String(dateStr).match(/\d{4}/)?.[0];
        if (!digits) {
            return null;
        }

        const year = Number.parseInt(digits, 10);
        return (year >= 1900 && year <= new Date().getFullYear() + 1) ? year : null;
    }

    /**
     * Check if metadata has the minimum required fields.
     *
     * @returns {boolean} `true` when the title is longer than 10 characters
     *     and at least two of authors, journal, year and DOI are present.
     */
    isValid() {
        const hasTitle = Boolean(this.metadata.title && this.metadata.title.length > 10);
        const hasAuthor = Boolean(this.metadata.authors);
        const hasJournal = Boolean(this.metadata.journal);
        const hasYear = Boolean(this.metadata.year);
        const hasDOI = Boolean(this.metadata.doi);

        const otherFields = [hasAuthor, hasJournal, hasYear, hasDOI].filter(Boolean).length;

        return hasTitle && otherFields >= 2;
    }

    /**
     * Get formatted metadata for display: up to the first three authors,
     * the year in parentheses, and the journal, separated by spaces.
     *
     * @returns {string} The summary; empty string when nothing is available.
     */
    getFormattedSummary() {
        const parts = [];

        if (this.metadata.authors) {
            const authors = this.metadata.authors.split(';').slice(0, 3).join(';');
            parts.push(authors);
        }

        if (this.metadata.year) {
            parts.push(`(${this.metadata.year})`);
        }

        if (this.metadata.journal) {
            parts.push(this.metadata.journal);
        }

        return parts.join(' ');
    }
}

// Publish the class through CommonJS when a `module.exports` object is
// available; in a plain browser script context this is a no-op.
if (typeof module !== 'undefined') {
    if (module.exports) {
        module.exports = MetadataExtractor;
    }
}
