How to Build a Trend Forecasting Tool with Social Scraping

How to Build a Trend Forecasting Tool with Social Scraping

Trends emerge on social media before hitting mainstream. By scraping platforms systematically, you can detect rising trends days before they become obvious.

Data Collection

import sqlite3
import time
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

class TrendCollector:
    """Collects keyword mentions from social platforms into a SQLite store.

    Rows are persisted to the ``mentions`` table so downstream velocity and
    forecasting components can query them by keyword and timestamp.
    """

    def __init__(self, db_path='trends.db', api_key=None):
        """
        Args:
            db_path: SQLite database path (':memory:' works for testing).
            api_key: optional ScraperAPI key; when set, requests are proxied.
        """
        self.db = sqlite3.connect(db_path)
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'TrendResearch/1.0'})
        self._init_db()

    def _init_db(self):
        # Idempotent schema setup; the (keyword, timestamp) index backs the
        # windowed COUNT/SUM queries used by the velocity engine.
        self.db.executescript('''
            CREATE TABLE IF NOT EXISTS mentions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                platform TEXT, keyword TEXT, content TEXT,
                engagement INTEGER, timestamp DATETIME, url TEXT);
            CREATE INDEX IF NOT EXISTS idx_kw ON mentions(keyword, timestamp);
        ''')

    def _fetch(self, url):
        """GET `url`, optionally proxied through ScraperAPI.

        Uses HTTPS and a params dict so the API key and target URL are
        properly encoded (the original built the proxy URL by hand over
        plain HTTP, leaking the key in clear text and mangling any target
        URL that itself contained '?' or '&').
        """
        if self.api_key:
            return self.session.get(
                'https://api.scraperapi.com/',
                params={'api_key': self.api_key, 'url': url, 'render': 'true'},
                timeout=30)
        return self.session.get(url, timeout=30)

    def collect_reddit(self, keyword, subs=None):
        """Scrape recent reddit search results for `keyword`.

        Args:
            keyword: search term; URL-encoded before the request so terms
                with spaces or '&' don't corrupt the query string.
            subs: subreddit names to search; defaults to ['all'].

        Returns:
            List of mention dicts; each row is also persisted to the DB.
        """
        subs = subs or ['all']
        mentions = []
        for sub in subs:
            resp = self._fetch(
                f"https://old.reddit.com/r/{sub}/search"
                f"?q={quote_plus(keyword)}&sort=new&t=week")
            soup = BeautifulSoup(resp.text, 'html.parser')
            for post in soup.select('.thing.link'):
                title = post.select_one('a.title')
                score = post.select_one('.score.unvoted')
                if not title:
                    continue
                eng = 0
                if score:
                    # Hidden scores render as non-numeric text ('•');
                    # catch only the parse failure, not every exception.
                    try:
                        eng = int(score.get_text(strip=True))
                    except ValueError:
                        pass
                mentions.append({
                    'platform': 'reddit', 'keyword': keyword,
                    'content': title.get_text(strip=True),
                    'engagement': eng, 'url': title.get('href', ''),
                    'timestamp': datetime.now().isoformat()
                })
            time.sleep(2)  # polite delay between subreddit requests
        # Single batched insert instead of one execute() per row.
        self.db.executemany(
            'INSERT INTO mentions (platform,keyword,content,engagement,timestamp,url) VALUES (?,?,?,?,?,?)',
            [(m['platform'], m['keyword'], m['content'],
              m['engagement'], m['timestamp'], m['url']) for m in mentions])
        self.db.commit()
        return mentions

Velocity Engine

The key insight: acceleration matters more than volume.

class VelocityEngine:
    """Computes rate-of-change metrics over stored mentions.

    A velocity compares the most recent window of `hours` against the
    window immediately before it: (current - previous) / previous.
    """

    def __init__(self, db):
        # db: sqlite3 connection exposing a `mentions` table
        # (see TrendCollector._init_db for the schema).
        self.db = db

    def velocity(self, keyword, hours=24):
        """Relative change in mention COUNT between consecutive windows."""
        return self._window_ratio(keyword, hours, self._count)

    def engagement_velocity(self, keyword, hours=24):
        """Relative change in summed engagement between consecutive windows."""
        return self._window_ratio(keyword, hours, self._engagement)

    def _window_ratio(self, keyword, hours, metric):
        # Shared window arithmetic — previously duplicated verbatim in both
        # public methods. Returns inf when activity appears from a zero
        # baseline, and 0 when both windows are empty.
        now = datetime.now()
        cur_start = now - timedelta(hours=hours)
        prev_start = cur_start - timedelta(hours=hours)
        cur = metric(keyword, cur_start, now)
        prev = metric(keyword, prev_start, cur_start)
        if prev == 0:
            return float('inf') if cur > 0 else 0
        return round((cur - prev) / prev, 4)

    def _count(self, kw, start, end):
        # ISO-8601 strings compare lexicographically, so BETWEEN on the
        # TEXT timestamps is chronologically correct.
        c = self.db.execute(
            'SELECT COUNT(*) FROM mentions WHERE keyword=? AND timestamp BETWEEN ? AND ?',
            (kw, start.isoformat(), end.isoformat()))
        return c.fetchone()[0]

    def _engagement(self, kw, start, end):
        # COALESCE keeps the result numeric when no rows match.
        c = self.db.execute(
            'SELECT COALESCE(SUM(engagement),0) FROM mentions WHERE keyword=? AND timestamp BETWEEN ? AND ?',
            (kw, start.isoformat(), end.isoformat()))
        return c.fetchone()[0]

Breakout Detection

class BreakoutDetector:
    """Flags keywords whose combined velocity score crosses a threshold."""

    def __init__(self, engine):
        # engine: any object exposing velocity() and engagement_velocity().
        self.engine = engine

    def detect(self, keywords, threshold=0.5):
        """Score each keyword and return breakouts, highest score first.

        Score = 0.4 * mention velocity + 0.6 * engagement velocity; only
        keywords strictly above `threshold` are reported.
        """
        breakouts = []
        for kw in keywords:
            mention_vel = self.engine.velocity(kw)
            eng_vel = self.engine.engagement_velocity(kw)
            combined = mention_vel * 0.4 + eng_vel * 0.6
            if combined <= threshold:
                continue
            breakouts.append({
                'keyword': kw,
                'mention_vel': mention_vel,
                'engagement_vel': eng_vel,
                'score': round(combined, 3),
            })
        breakouts.sort(key=lambda entry: entry['score'], reverse=True)
        return breakouts

Trend Forecasting

class Forecaster:
    """Labels a keyword's trajectory from its recent daily mention counts."""

    def predict(self, keyword, db, points=7):
        """Fit a least-squares line to daily volumes and bucket the slope.

        Returns {'prediction': 'insufficient_data'} when fewer than three
        days of history exist; otherwise the keyword, a trend label, and
        the growth rate (slope normalised by the mean daily volume).
        """
        rows = db.execute('''
            SELECT DATE(timestamp), COUNT(*) FROM mentions
            WHERE keyword=? GROUP BY DATE(timestamp)
            ORDER BY DATE(timestamp) DESC LIMIT ?
        ''', (keyword, points)).fetchall()
        if len(rows) < 3:
            return {'prediction': 'insufficient_data'}
        # Rows arrive newest-first; flip to chronological order.
        volumes = [count for _, count in reversed(rows)]
        n = len(volumes)
        x_mean = (n - 1) / 2
        y_mean = sum(volumes) / n
        # Ordinary least squares slope over day index vs. volume.
        numerator = sum((i - x_mean) * (v - y_mean)
                        for i, v in enumerate(volumes))
        denominator = sum((i - x_mean) ** 2 for i in range(n))
        slope = numerator / denominator
        rate = slope / y_mean if y_mean > 0 else 0
        if rate > 0.15:
            label = 'accelerating'
        elif rate > 0.05:
            label = 'growing'
        elif rate > -0.05:
            label = 'stable'
        else:
            label = 'declining'
        return {'keyword': keyword, 'prediction': label,
                'growth_rate': round(rate, 4)}

Social platforms are difficult to scrape at scale. ScraperAPI handles anti-bot protections, ThorData provides residential proxy rotation, and ScrapeOps lets you monitor request success rates and costs.

Follow for more Python data science tutorials.

Total
0
Shares
Leave a Reply

Your email address will not be published. Required fields are marked *

Previous Post

You can now transfer your chats and personal information from other chatbots directly into Gemini

Next Post

Anthropic wins injunction against Trump administration over Defense Department saga

Related Posts