"""
News fetching tasks.

Celery tasks that poll the RSS feeds listed in ``NEWS_FEEDS`` and prune old
articles. Database persistence is not implemented yet; the spots where it
belongs are marked as placeholders.
"""
from datetime import datetime, timedelta, timezone

import feedparser
import structlog

from app.workers.celery_app import celery_app
from app.core.database import get_sync_db
from app.core.config import settings

logger = structlog.get_logger()

# Upper bound on the number of entries processed per feed on each run.
MAX_ENTRIES_PER_FEED = 50

# RSS Feeds to monitor
NEWS_FEEDS = [
    # General Financial News
    {"name": "Yahoo Finance", "url": "https://finance.yahoo.com/news/rssindex"},
    {"name": "Reuters Business", "url": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"},
    {"name": "CNBC", "url": "https://www.cnbc.com/id/100003114/device/rss/rss.html"},
    {"name": "MarketWatch", "url": "https://feeds.marketwatch.com/marketwatch/topstories/"},
    {"name": "Seeking Alpha", "url": "https://seekingalpha.com/market_currents.xml"},
    {"name": "Bloomberg", "url": "https://www.bloomberg.com/feed/podcast/etf-report.xml"},
    # Tech
    {"name": "TechCrunch", "url": "https://techcrunch.com/feed/"},
    # Crypto (because why not)
    {"name": "CoinDesk", "url": "https://www.coindesk.com/arc/outboundfeeds/rss/"},
]


def _entry_published_at(entry) -> datetime:
    """Return the entry's publication time as a timezone-aware UTC datetime.

    feedparser exposes ``published_parsed`` / ``updated_parsed`` as time
    tuples normalised to UTC, so the first six fields are tagged with UTC
    directly. When the entry carries neither, the current UTC time is used.
    """
    parsed = entry.get("published_parsed") or entry.get("updated_parsed")
    if parsed:
        return datetime(*parsed[:6], tzinfo=timezone.utc)
    return datetime.now(timezone.utc)


def _extract_article(entry, source_name: str) -> dict:
    """Collect the fields of a feed entry that make up an article record."""
    return {
        "source": source_name,
        "title": entry.get("title", ""),
        "url": entry.get("link", ""),
        "summary": entry.get("summary", ""),
        "author": entry.get("author", ""),
        "published_at": _entry_published_at(entry),
    }


def _save_article(article: dict) -> bool:
    """Persist an article and report whether it was stored.

    Placeholder: persistence is not implemented yet, so every article is
    reported as saved. The real implementation would use the db session and
    skip articles that already exist.
    """
    return True


@celery_app.task(name="app.workers.tasks.news_tasks.fetch_all_news")
def fetch_all_news() -> dict:
    """Fetch news from all configured sources.

    Feeds are processed one after another; a failure in one feed is logged
    and does not stop the remaining feeds.

    Returns:
        ``{"fetched": <total number of articles counted across all feeds>}``.
    """
    logger.info("Starting news fetch from all sources")

    total_fetched = 0

    for feed_config in NEWS_FEEDS:
        try:
            # Calling the task object directly runs it inline in this worker.
            count = fetch_from_feed(feed_config["name"], feed_config["url"])
            total_fetched += count
        except Exception as e:
            logger.error(
                "Failed to fetch from feed",
                feed=feed_config["name"],
                error=str(e)
            )

    logger.info("News fetch complete", total_articles=total_fetched)
    return {"fetched": total_fetched}


@celery_app.task(name="app.workers.tasks.news_tasks.fetch_from_feed")
def fetch_from_feed(source_name: str, feed_url: str) -> int:
    """Fetch news from a single RSS feed.

    Args:
        source_name: Human-readable feed name, used for logging and stored
            on each article record.
        feed_url: URL of the RSS/Atom feed.

    Returns:
        Number of articles counted as saved; 0 when the feed could not be
        fetched or parsed. This function never raises.
    """
    logger.info("Fetching from feed", source=source_name)

    try:
        feed = feedparser.parse(feed_url)

        # feedparser reports download/parse problems through the ``bozo``
        # flag instead of raising, so it has to be checked explicitly or a
        # dead feed would look like a healthy feed with zero articles.
        if feed.get("bozo"):
            bozo_error = str(feed.get("bozo_exception", "unknown error"))
            if not feed.entries:
                logger.error(
                    "Failed to parse feed", source=source_name, error=bozo_error
                )
                return 0
            # Malformed but partially readable: keep what was recovered.
            logger.warning(
                "Feed is malformed, processing recovered entries",
                source=source_name,
                error=bozo_error,
            )

        articles_saved = 0

        for entry in feed.entries[:MAX_ENTRIES_PER_FEED]:
            try:
                article = _extract_article(entry, source_name)
                if _save_article(article):
                    articles_saved += 1
            except Exception as e:
                # One bad entry must not abort the rest of the feed.
                logger.warning(
                    "Failed to process article",
                    title=entry.get("title", "unknown"),
                    error=str(e)
                )

        logger.info("Feed processed", source=source_name, articles=articles_saved)
        return articles_saved

    except Exception as e:
        logger.error("Failed to parse feed", source=source_name, error=str(e))
        return 0


@celery_app.task(name="app.workers.tasks.news_tasks.cleanup_old_news")
def cleanup_old_news(days: int = 90):
    """Remove news articles older than specified days.

    Args:
        days: Retention window in days; must not be negative.

    Returns:
        ``{"deleted": <number of articles removed>}``. Deletion is still a
        placeholder, so the count is currently always 0.

    Raises:
        ValueError: If ``days`` is negative. A negative window would place
            the cutoff in the future and match every stored article.
    """
    if days < 0:
        raise ValueError(f"days must be non-negative, got {days}")

    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
    logger.info(
        "Starting news cleanup", days_to_keep=days, cutoff=cutoff.isoformat()
    )

    # Placeholder - actual implementation would delete articles published
    # before `cutoff` from the database
    deleted_count = 0

    logger.info("News cleanup complete", deleted=deleted_count)
    return {"deleted": deleted_count}