Add initial repo with data generator

This commit is contained in:
thecodebranch 2024-07-01 09:39:44 -06:00
commit 90f3d0df6e
11 changed files with 581 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
__pycache__
*.db
*checkpoints
output

0
README.md Normal file
View File

0
__init__.py Normal file
View File

124
data/data_generator.py Normal file
View File

@ -0,0 +1,124 @@
import csv
import os
import random
import string
from datetime import datetime
from datetime import timedelta
class StockDataGenerator:
    """Generate synthetic daily stock rows and append them to a CSV file.

    Each fake company gets a random four-letter ticker, a random-walk
    closing price, and a daily volume that grows with the size of the
    day's price move. Rows are written one ticker at a time.

    Args:
        daily_price_fluctuation: Maximum fractional change of the closing
            price from one day to the next (applied in both directions).
        daily_volume_fluctuation: Maximum fractional deviation of a day's
            volume from the ticker's average volume.
        date_format: ``strptime``/``strftime`` format used to parse
            ``start_date``/``end_date`` and to render row dates.
        num_companies: Number of distinct tickers to generate.
        start_date: First calendar day of the range (inclusive).
        end_date: Last calendar day of the range (inclusive).
        csv_output_file: Path of the CSV file rows are appended to.
        chunk_size: Stored on the instance; not used by this class.
        volume_adjustment_factor: Multiplier applied to the fractional
            price change when scaling the day's volume.
    """

    # Tickers are built from this many uppercase ASCII letters.
    _TICKER_LENGTH = 4

    def __init__(
        self,
        daily_price_fluctuation=.02,
        daily_volume_fluctuation=.05,
        date_format="%Y-%m-%d",
        num_companies=4000,
        start_date="2000-01-01",
        end_date="2024-01-01",
        csv_output_file="./output/tickers.csv",
        chunk_size=500000,
        volume_adjustment_factor=5
    ):
        self.daily_price_fluctuation = daily_price_fluctuation
        self.daily_volume_fluctuation = daily_volume_fluctuation
        self.date_format = date_format
        self.num_companies = num_companies
        self.start_date = datetime.strptime(start_date, self.date_format)
        self.end_date = datetime.strptime(end_date, self.date_format)
        self.chunk_size = chunk_size
        self.csv_output_file = csv_output_file
        self.volume_adjustment_factor = volume_adjustment_factor

    def generate_raw_data(self, start_date="2000-01-01", end_date="2024-01-01"):
        """Generate rows for every ticker and append them to the CSV file.

        ``start_date`` and ``end_date`` are kept for backward compatibility
        but are ignored: the date range given to the constructor is used.
        Progress is printed after each ticker has been written.
        """
        tickers = self._generate_fake_tickers()
        dates = self._generate_dates()
        total = len(tickers)
        for ticker_count, ticker in enumerate(tickers, start=1):
            self._write_to_csv(self._generate_stock_data(ticker, dates))
            print(f"Generated data for: {ticker} and is {ticker_count} of {total}")

    def _generate_dates(self):
        """Return the formatted weekdays between start and end date, inclusive."""
        dates = []
        current_date = self.start_date
        while current_date <= self.end_date:
            # weekday() is 0-4 for Monday-Friday; weekends are skipped.
            if current_date.weekday() < 5:
                dates.append(current_date.strftime(self.date_format))
            current_date += timedelta(days=1)
        return dates

    def _generate_fake_tickers(self):
        """Return ``num_companies`` distinct random uppercase tickers.

        Raises:
            ValueError: If ``num_companies`` is negative or larger than the
                number of distinct tickers that exist, since such a request
                can never be satisfied.
        """
        max_tickers = len(string.ascii_uppercase) ** self._TICKER_LENGTH
        if not 0 <= self.num_companies <= max_tickers:
            raise ValueError(
                f"num_companies must be between 0 and {max_tickers}, "
                f"got {self.num_companies}"
            )
        companies = []
        # The set gives O(1) duplicate checks; the list keeps draw order.
        seen = set()
        while len(companies) < self.num_companies:
            ticker = "".join(
                random.choices(string.ascii_uppercase, k=self._TICKER_LENGTH)
            )
            if ticker not in seen:
                seen.add(ticker)
                companies.append(ticker)
        return companies

    def _generate_stock_data(self, symbol, dates):
        """Build one row dict per entry of ``dates`` for ``symbol``.

        Returns:
            A list of dicts with keys ``date``, ``symbol``, ``open``,
            ``high``, ``low``, ``close`` and ``volume``. Prices are strings
            formatted to three decimals; volume is an int. The symbol is
            suffixed with ``-fake``.
        """
        starting_price = random.uniform(5, 500)
        avg_daily_volume = random.randint(50000, 2000000)
        closes = [starting_price]
        # The first day has no previous close, so it uses the average volume.
        volumes = [avg_daily_volume]
        for _ in range(1, len(dates)):
            previous_close = closes[-1]
            # Simulate price using a random walk.
            price = previous_close * (1 + random.uniform(
                -self.daily_price_fluctuation, self.daily_price_fluctuation
            ))
            closes.append(price)
            # Simulate volume with some daily variability.
            daily_volume = avg_daily_volume * (1 + random.uniform(
                -self.daily_volume_fluctuation, self.daily_volume_fluctuation
            ))
            # Larger price moves trade more volume.
            price_change = abs(price - previous_close) / previous_close
            daily_volume *= 1 + price_change * self.volume_adjustment_factor
            volumes.append(int(daily_volume))

        stock_data = []
        for date, close_price, volume in zip(dates, closes, volumes):
            open_price = close_price * random.uniform(0.95, 1.05)
            # max/min keep the open inside the [low, high] range.
            high_price = max(open_price, close_price * random.uniform(1.00, 1.10))
            low_price = min(open_price, close_price * random.uniform(0.90, 0.99))
            stock_data.append({
                "date": date,
                "symbol": f"{symbol}-fake",
                "open": f"{open_price:.3f}",
                "high": f"{high_price:.3f}",
                "low": f"{low_price:.3f}",
                "close": f"{close_price:.3f}",
                "volume": volume
            })
        return stock_data

    def _write_to_csv(self, data):
        """Append ``data`` rows to the CSV file, writing the header if it is empty.

        The parent directory of the output file is created when missing.
        """
        output_dir = os.path.dirname(self.csv_output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        fieldnames = ["date", "symbol", "open", "high", "low", "close", "volume"]
        # newline="" lets the csv module control line endings itself.
        with open(self.csv_output_file, "a", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file.tell() == 0:
                # Position 0 in append mode means the file is new or empty.
                writer.writeheader()
            writer.writerows(data)
if __name__ == "__main__":
    # Script entry point: build a generator with the default settings
    # and write the full data set.
    StockDataGenerator().generate_raw_data()

18
database/__init__.py Normal file
View File

@ -0,0 +1,18 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session
from model.base import Base
class Database:
    """Hold the SQLAlchemy engine and session used by the project.

    Args:
        database_url: SQLAlchemy connection URL; defaults to a SQLite
            file named ``finance.db``.
    """

    def __init__(self, database_url="sqlite:///finance.db"):
        engine = create_engine(database_url)
        session_factory = sessionmaker(bind=engine)
        self.engine = engine
        # The factory is wrapped in a scoped_session before being exposed.
        self.session = scoped_session(session_factory)

    def create_tables(self):
        """Create every table registered on ``Base.metadata``."""
        metadata = Base.metadata
        metadata.create_all(self.engine)

    def drop_tables(self):
        """Drop every table registered on ``Base.metadata``."""
        metadata = Base.metadata
        metadata.drop_all(self.engine)

6
makefile Normal file
View File

@ -0,0 +1,6 @@
# `clean` is not a file, so it is declared phony to make it always run.
.PHONY: clean
# Remove every __pycache__ directory found below the current directory.
clean:
	@echo "Cleaning up..."
	find . -type d -name "__pycache__" -exec rm -rf {} +
	@echo "Cleanup complete."

50
manage.py Normal file
View File

@ -0,0 +1,50 @@
import csv
from datetime import datetime
from database import Database
from model.ticker import Ticker
# Module-level Database built with its default URL; created at import time.
database = Database()
def load_database(chunk_size=100000, csv_path="./data/output/tickers.csv"):
    """Recreate the tables and bulk-load ticker rows from a CSV file.

    All tables are dropped and created again before loading, so any
    existing data is discarded. Rows are inserted and committed in
    batches of ``chunk_size`` and a running total is printed after each
    commit.

    Args:
        chunk_size: Number of rows to accumulate before each commit.
        csv_path: Path of the CSV file to load. It must have the columns
            ``date`` (``%Y-%m-%d``), ``symbol``, ``open``, ``high``,
            ``low``, ``close`` and ``volume``.

    Raises:
        Exception: Any error raised while parsing or inserting is
            re-raised after the pending transaction has been rolled back,
            so a failed load is not reported as a success.
    """
    database.drop_tables()
    database.create_tables()
    session = database.session
    # newline="" is the file mode the csv module asks for.
    with open(csv_path, newline="", encoding="utf-8") as file:
        try:
            rows = []
            total = 0
            for row in csv.DictReader(file):
                # Convert explicitly rather than relying on the database
                # to coerce CSV strings into numeric columns.
                rows.append(Ticker(
                    date=datetime.strptime(row["date"], "%Y-%m-%d").date(),
                    symbol=row["symbol"],
                    open=float(row["open"]),
                    high=float(row["high"]),
                    low=float(row["low"]),
                    close=float(row["close"]),
                    volume=int(row["volume"]),
                ))
                if len(rows) >= chunk_size:
                    session.bulk_save_objects(rows)
                    session.commit()
                    total += len(rows)
                    print(f"{total} rows inserted")
                    rows = []
            # Insert any remaining rows.
            if rows:
                session.bulk_save_objects(rows)
                session.commit()
                total += len(rows)
                print(f"{total} inserted")
        except Exception:
            # Discard the uncommitted batch, then surface the failure.
            session.rollback()
            raise
# Script entry point: rebuild the database with the default chunk size.
if __name__ == "__main__":
    load_database()

3
model/base.py Normal file
View File

@ -0,0 +1,3 @@
from sqlalchemy.ext.declarative import declarative_base
# Shared declarative base class for the project's ORM models.
# NOTE(review): newer SQLAlchemy releases expose declarative_base from
# sqlalchemy.orm instead — confirm against the installed version.
Base = declarative_base()

21
model/ticker.py Normal file
View File

@ -0,0 +1,21 @@
from datetime import datetime
from sqlalchemy import Column
from sqlalchemy import DateTime
from sqlalchemy import Float
from sqlalchemy import Integer
from sqlalchemy import String
from model.base import Base
class Ticker(Base):
    """ORM model for one row of the ``ticker`` table.

    Columns hold a date, a required symbol, the open/high/low/close
    prices and the volume.
    """

    __tablename__ = "ticker"

    # Auto-incrementing integer primary key.
    id = Column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )
    date = Column(DateTime)
    # The symbol is the only column declared NOT NULL besides the key.
    symbol = Column(
        String(15),
        nullable=False,
    )
    # Price columns.
    open = Column(Float)
    high = Column(Float)
    low = Column(Float)
    close = Column(Float)
    volume = Column(Integer)

0
notebooks/__init__.py Normal file
View File

355
notebooks/eda.ipynb Normal file

File diff suppressed because one or more lines are too long