How to Build a Trend Forecasting Tool with Social Scraping
Trends emerge on social media before they hit the mainstream. By scraping platforms systematically, you can detect rising trends days before they become obvious.
Data Collection
import sqlite3
import time
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
class TrendCollector:
    """Scrapes social platforms for keyword mentions and stores them in SQLite."""

    def __init__(self, db_path='trends.db', api_key=None):
        """Open the database, prepare the HTTP session, and ensure the schema.

        Args:
            db_path: Path to the SQLite database file.
            api_key: Optional ScraperAPI key; when set, all fetches are proxied.
        """
        self.db = sqlite3.connect(db_path)
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'TrendResearch/1.0'})
        self._init_db()

    def _init_db(self):
        """Create the mentions table and its keyword/timestamp index if missing."""
        self.db.executescript('''
            CREATE TABLE IF NOT EXISTS mentions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                platform TEXT, keyword TEXT, content TEXT,
                engagement INTEGER, timestamp DATETIME, url TEXT);
            CREATE INDEX IF NOT EXISTS idx_kw ON mentions(keyword, timestamp);
        ''')

    def _fetch(self, url):
        """GET *url*, optionally through the ScraperAPI proxy.

        Fix: the target URL is now passed via ``params`` so requests
        percent-encodes it (a raw '&' or '?' in *url* previously leaked into
        the proxy query string and corrupted the request), and the proxy
        endpoint uses HTTPS so the API key is not sent in cleartext.
        """
        if self.api_key:
            return self.session.get(
                'https://api.scraperapi.com/',
                params={'api_key': self.api_key, 'url': url, 'render': 'true'})
        return self.session.get(url)

    def collect_reddit(self, keyword, subs=None):
        """Scrape the past week of Reddit search results for *keyword*.

        Args:
            keyword: Search term; percent-encoded before being placed in the URL.
            subs: Subreddit names to search; defaults to ['all'].

        Returns:
            List of mention dicts (platform, keyword, content, engagement,
            url, timestamp); each is also persisted to the mentions table.
        """
        subs = subs or ['all']
        mentions = []
        for sub in subs:
            # quote_plus: spaces/&/# in the keyword would otherwise break the query.
            resp = self._fetch(
                f"https://old.reddit.com/r/{sub}/search?q={quote_plus(keyword)}&sort=new&t=week")
            soup = BeautifulSoup(resp.text, 'html.parser')
            for post in soup.select('.thing.link'):
                title = post.select_one('a.title')
                score = post.select_one('.score.unvoted')
                if not title:
                    continue
                eng = 0
                if score:
                    try:
                        eng = int(score.get_text(strip=True))
                    except ValueError:
                        # Hidden/placeholder scores (non-numeric text) count as 0;
                        # narrowed from a bare except that swallowed everything.
                        pass
                mentions.append({
                    'platform': 'reddit', 'keyword': keyword,
                    'content': title.get_text(strip=True),
                    'engagement': eng, 'url': title.get('href', ''),
                    'timestamp': datetime.now().isoformat()
                })
            time.sleep(2)  # polite per-subreddit rate limit
        # executemany batches the inserts instead of one execute() per row.
        self.db.executemany(
            'INSERT INTO mentions (platform,keyword,content,engagement,timestamp,url) VALUES (?,?,?,?,?,?)',
            [(m['platform'], m['keyword'], m['content'], m['engagement'],
              m['timestamp'], m['url']) for m in mentions])
        self.db.commit()
        return mentions
Velocity Engine
The key insight: acceleration matters more than volume.
class VelocityEngine:
    """Computes period-over-period growth of mention counts and engagement."""

    def __init__(self, db):
        # db: open sqlite3 connection holding the `mentions` table.
        self.db = db

    def velocity(self, keyword, hours=24):
        """Growth rate of mention *count* vs. the preceding window of equal size."""
        return self._window_growth(keyword, hours, self._count)

    def engagement_velocity(self, keyword, hours=24):
        """Growth rate of summed *engagement* vs. the preceding window."""
        return self._window_growth(keyword, hours, self._engagement)

    def _window_growth(self, keyword, hours, measure):
        """Shared window comparison: (current - previous) / previous.

        Extracted from velocity()/engagement_velocity(), which were duplicates
        differing only in the aggregate. Returns inf when the previous window
        is empty but the current is not, 0 when both are empty.
        NOTE(review): uses naive local time via datetime.now() and compares
        ISO strings lexicographically — assumes stored timestamps are naive
        local ISO strings too; confirm before mixing time zones.
        """
        now = datetime.now()
        cur_start = now - timedelta(hours=hours)
        prev_start = cur_start - timedelta(hours=hours)
        cur = measure(keyword, cur_start, now)
        prev = measure(keyword, prev_start, cur_start)
        if prev == 0:
            return float('inf') if cur > 0 else 0
        return round((cur - prev) / prev, 4)

    def _count(self, kw, start, end):
        """Number of mentions of *kw* with timestamp in [start, end]."""
        c = self.db.execute(
            'SELECT COUNT(*) FROM mentions WHERE keyword=? AND timestamp BETWEEN ? AND ?',
            (kw, start.isoformat(), end.isoformat()))
        return c.fetchone()[0]

    def _engagement(self, kw, start, end):
        """Total engagement for *kw* in [start, end]; 0 when there are no rows."""
        c = self.db.execute(
            'SELECT COALESCE(SUM(engagement),0) FROM mentions WHERE keyword=? AND timestamp BETWEEN ? AND ?',
            (kw, start.isoformat(), end.isoformat()))
        return c.fetchone()[0]
Breakout Detection
class BreakoutDetector:
    """Flags keywords whose weighted velocity score crosses a breakout threshold."""

    def __init__(self, engine):
        # engine: provides velocity() and engagement_velocity() per keyword.
        self.engine = engine

    def detect(self, keywords, threshold=0.5):
        """Score each keyword (40% mention velocity, 60% engagement velocity)
        and return those above *threshold*, highest score first.
        """
        hot = []
        for term in keywords:
            mention_v = self.engine.velocity(term)
            engage_v = self.engine.engagement_velocity(term)
            combined = mention_v * 0.4 + engage_v * 0.6
            if combined <= threshold:
                continue
            hot.append({
                'keyword': term,
                'mention_vel': mention_v,
                'engagement_vel': engage_v,
                'score': round(combined, 3),
            })
        hot.sort(key=lambda row: row['score'], reverse=True)
        return hot
Trend Forecasting
class Forecaster:
    """Classifies a keyword's trajectory from its daily mention counts."""

    def predict(self, keyword, db, points=7):
        """Fit a least-squares slope to the last *points* days of mention
        counts and label the trend.

        Args:
            keyword: Keyword to look up in the mentions table.
            db: Open sqlite3 connection.
            points: Maximum number of most recent days to fit.

        Returns:
            Dict with 'keyword', 'prediction' (accelerating/growing/stable/
            declining or insufficient_data), and 'growth_rate' when fitted.
        """
        cursor = db.execute('''
            SELECT DATE(timestamp), COUNT(*) FROM mentions
            WHERE keyword=? GROUP BY DATE(timestamp)
            ORDER BY DATE(timestamp) DESC LIMIT ?
        ''', (keyword, points))
        data = cursor.fetchall()
        if len(data) < 3:
            # Fix: include 'keyword' so this result has the same shape as the
            # success path and callers can attribute it to its query.
            return {'keyword': keyword, 'prediction': 'insufficient_data'}
        # Rows arrive newest-first; reverse into chronological order.
        vols = [count for _, count in reversed(data)]
        n = len(vols)
        xm = (n - 1) / 2          # mean of x = 0..n-1
        ym = sum(vols) / n
        slope = sum((i - xm) * (v - ym) for i, v in enumerate(vols))
        slope /= sum((i - xm) ** 2 for i in range(n))
        # Normalize the slope by mean volume so the thresholds are scale-free.
        rate = slope / ym if ym > 0 else 0
        if rate > 0.15:
            label = 'accelerating'
        elif rate > 0.05:
            label = 'growing'
        elif rate > -0.05:
            label = 'stable'
        else:
            label = 'declining'
        return {'keyword': keyword, 'prediction': label, 'growth_rate': round(rate, 4)}
Social platforms are hard to scrape. ScraperAPI handles anti-bot protections. ThorData provides residential proxy rotation. Track request success rates with ScrapeOps.
Follow for more Python data science tutorials.