marketscanner/backend/app/workers/tasks/news_tasks.py

"""
News fetching tasks
"""

from datetime import datetime, timedelta
import feedparser
import structlog

from app.workers.celery_app import celery_app
from app.core.database import get_sync_db
from app.core.config import settings

logger = structlog.get_logger()

# RSS Feeds to monitor
NEWS_FEEDS = [
    # General Financial News
    {"name": "Yahoo Finance", "url": "https://finance.yahoo.com/news/rssindex"},
    {"name": "Reuters Business", "url": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"},
    {"name": "CNBC", "url": "https://www.cnbc.com/id/100003114/device/rss/rss.html"},
    {"name": "MarketWatch", "url": "https://feeds.marketwatch.com/marketwatch/topstories/"},
    {"name": "Seeking Alpha", "url": "https://seekingalpha.com/market_currents.xml"},
    {"name": "Bloomberg", "url": "https://www.bloomberg.com/feed/podcast/etf-report.xml"},

    # Tech
    {"name": "TechCrunch", "url": "https://techcrunch.com/feed/"},

    # Crypto (because why not)
    {"name": "CoinDesk", "url": "https://www.coindesk.com/arc/outboundfeeds/rss/"},
]


@celery_app.task(name="app.workers.tasks.news_tasks.fetch_all_news")
def fetch_all_news():
    """Fetch news from all configured sources."""
    logger.info("Starting news fetch from all sources")
    total_fetched = 0

    for feed_config in NEWS_FEEDS:
        try:
            count = fetch_from_feed(feed_config["name"], feed_config["url"])
            total_fetched += count
        except Exception as e:
            logger.error(
                "Failed to fetch from feed",
                feed=feed_config["name"],
                error=str(e)
            )

    logger.info("News fetch complete", total_articles=total_fetched)
    return {"fetched": total_fetched}


@celery_app.task(name="app.workers.tasks.news_tasks.fetch_from_feed")
def fetch_from_feed(source_name: str, feed_url: str) -> int:
    """Fetch news from a single RSS feed."""
    logger.info("Fetching from feed", source=source_name)

    try:
        feed = feedparser.parse(feed_url)
        articles_saved = 0

        for entry in feed.entries[:50]:  # Limit to 50 most recent
            try:
                # Extract data
                title = entry.get("title", "")
                url = entry.get("link", "")
                summary = entry.get("summary", "")
                author = entry.get("author", "")

                # Parse published date
                published = entry.get("published_parsed") or entry.get("updated_parsed")
                if published:
                    published_at = datetime(*published[:6])
                else:
                    published_at = datetime.utcnow()

                # Save to database (skip if exists)
                # This is a placeholder - actual implementation would use the db session
                articles_saved += 1

            except Exception as e:
                logger.warning(
                    "Failed to process article",
                    title=entry.get("title", "unknown"),
                    error=str(e)
                )

        logger.info("Feed processed", source=source_name, articles=articles_saved)
        return articles_saved

    except Exception as e:
        logger.error("Failed to parse feed", source=source_name, error=str(e))
        return 0


@celery_app.task(name="app.workers.tasks.news_tasks.cleanup_old_news")
def cleanup_old_news(days: int = 90):
    """Remove news articles older than specified days."""
    logger.info("Starting news cleanup", days_to_keep=days)

    cutoff = datetime.utcnow() - timedelta(days=days)

    # Placeholder - actual implementation would delete from database
    deleted_count = 0

    logger.info("News cleanup complete", deleted=deleted_count)
    return {"deleted": deleted_count}