Add initial repo with data generator
commit 90f3d0df6e
.gitignore (vendored, new file, 4 lines)
@@ -0,0 +1,4 @@
__pycache__
*.db
*checkpoints
output
__init__.py (new file, empty)
data/data_generator.py (new file, 124 lines)
@@ -0,0 +1,124 @@
import csv
import random
import string
from datetime import datetime
from datetime import timedelta


class StockDataGenerator:
    def __init__(
        self,
        daily_price_fluctuation=.02,
        daily_volume_fluctuation=.05,
        date_format="%Y-%m-%d",
        num_companies=4000,
        start_date="2000-01-01",
        end_date="2024-01-01",
        csv_output_file="./output/tickers.csv",
        chunk_size=500000,
        volume_adjustment_factor=5
    ):
        self.daily_price_fluctuation = daily_price_fluctuation
        self.daily_volume_fluctuation = daily_volume_fluctuation
        self.date_format = date_format
        self.num_companies = num_companies
        self.start_date = datetime.strptime(start_date, self.date_format)
        self.end_date = datetime.strptime(end_date, self.date_format)
        self.chunk_size = chunk_size
        self.csv_output_file = csv_output_file
        self.volume_adjustment_factor = volume_adjustment_factor

    def generate_raw_data(self):
        tickers = self._generate_fake_tickers()
        dates = self._generate_dates()

        ticker_count = 0
        for ticker in tickers:
            daily_ticker_data = self._generate_stock_data(ticker, dates)
            self._write_to_csv(daily_ticker_data)
            ticker_count += 1
            print(f"Generated data for {ticker} ({ticker_count} of {len(tickers)})")

    def _generate_dates(self):
        dates = []
        current_date = self.start_date
        while current_date <= self.end_date:
            # Skip weekends: weekday() is 0-4 for Monday through Friday.
            if current_date.weekday() < 5:
                dates.append(current_date.strftime(self.date_format))
            current_date += timedelta(days=1)
        return dates

    def _generate_fake_tickers(self):
        companies = []
        while True:
            ticker = "".join(random.choices(string.ascii_uppercase, k=4))
            if ticker not in companies:
                companies.append(ticker)

            if len(companies) == self.num_companies:
                break
        return companies

    def _generate_stock_data(self, symbol, dates):
        num_days = len(dates)
        starting_price = random.uniform(5, 500)
        prices = [starting_price]
        volumes = []
        avg_daily_volume = random.randint(50000, 2000000)

        for i in range(1, num_days):
            # Simulate price using a random walk.
            negative_change = -self.daily_price_fluctuation
            positive_change = self.daily_price_fluctuation

            price = prices[i-1] * (1 + random.uniform(negative_change, positive_change))
            prices.append(price)

            # Simulate volume with some daily variability.
            negative_change = -self.daily_volume_fluctuation
            positive_change = self.daily_volume_fluctuation
            daily_volume = avg_daily_volume * (1 + random.uniform(negative_change, positive_change))

            # Adjust volume based on price change magnitude.
            price_change = abs(prices[i] - prices[i-1]) / prices[i-1]
            volume_adjustment = 1 + price_change * self.volume_adjustment_factor
            daily_volume *= volume_adjustment
            volumes.append(int(daily_volume))

        stock_data = []
        for i in range(num_days):
            open_price = prices[i] * random.uniform(0.95, 1.05)
            high_price = max(open_price, prices[i] * random.uniform(1.00, 1.10))
            low_price = min(open_price, prices[i] * random.uniform(0.90, 0.99))
            close_price = prices[i]
            volume = volumes[i-1] if i > 0 else avg_daily_volume
            stock_data.append({
                "date": dates[i],
                "symbol": f"{symbol}-fake",
                "open": f"{open_price:.3f}",
                "high": f"{high_price:.3f}",
                "low": f"{low_price:.3f}",
                "close": f"{close_price:.3f}",
                "volume": volume
            })
        return stock_data

    def _write_to_csv(self, data):
        # newline="" prevents blank rows in the CSV on Windows.
        with open(self.csv_output_file, "a", newline="") as file:
            fieldnames = ["date", "symbol", "open", "high", "low", "close", "volume"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file.tell() == 0:
                # New file: write the header row first.
                writer.writeheader()
            writer.writerows(data)


if __name__ == "__main__":
    generator = StockDataGenerator()
    generator.generate_raw_data()
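As a quick smoke test of the generator (a hypothetical sketch, not part of this commit), the snippet below reads back the first few CSV rows. It assumes data/data_generator.py has already been run once and that the ./output directory existed beforehand, since the generator does not create it.

import csv

# Hypothetical smoke test: assumes ./output/tickers.csv was produced
# by running data/data_generator.py (the output path is relative to
# the working directory).
with open("./output/tickers.csv", newline="") as file:
    reader = csv.DictReader(file)
    for i, row in enumerate(reader):
        print(row["date"], row["symbol"], row["close"], row["volume"])
        if i >= 4:
            break  # first five rows are enough for a sanity check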
database/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import scoped_session
from model.base import Base


class Database:
    def __init__(self, database_url="sqlite:///finance.db"):
        self.engine = create_engine(database_url)
        self.session = scoped_session(sessionmaker(bind=self.engine))

    def create_tables(self):
        Base.metadata.create_all(self.engine)

    def drop_tables(self):
        Base.metadata.drop_all(self.engine)
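A minimal usage sketch for this wrapper (hypothetical, not in the commit): create the tables in the default finance.db, then run a trivial statement through the scoped session to confirm the connection works.

from sqlalchemy import text
from database import Database

# Hypothetical usage sketch: creates finance.db in the working
# directory, then checks the connection with a trivial query.
db = Database()
db.create_tables()
print(db.session.execute(text("SELECT 1")).scalar())  # prints 1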
makefile (new file, 6 lines)
@@ -0,0 +1,6 @@
.PHONY: clean

clean:
	@echo "Cleaning up..."
	find . -type d -name "__pycache__" -exec rm -rf {} +
	@echo "Cleanup complete."
manage.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import csv
from datetime import datetime
from database import Database
from model.ticker import Ticker


database = Database()


def load_database(chunk_size=100000):
    database.drop_tables()
    database.create_tables()

    with open("./data/output/tickers.csv") as file:
        try:
            rows = []
            reader = csv.DictReader(file)
            total = 0
            for row in reader:
                ticker = Ticker(
                    date=datetime.strptime(row.get("date"), "%Y-%m-%d").date(),
                    symbol=row.get("symbol"),
                    open=row.get("open"),
                    high=row.get("high"),
                    low=row.get("low"),
                    close=row.get("close"),
                    volume=row.get("volume")
                )
                rows.append(ticker)

                # Flush to the database in chunks to bound memory use.
                if len(rows) >= chunk_size:
                    database.session.bulk_save_objects(rows)
                    database.session.commit()
                    total += chunk_size
                    print(f"{total} rows inserted")
                    rows = []

            # Insert any remaining rows.
            if rows:
                database.session.bulk_save_objects(rows)
                database.session.commit()
                total += len(rows)
                print(f"{total} rows inserted")
        except Exception as e:
            import pprint
            pprint.pprint(e)


if __name__ == "__main__":
    load_database()
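One way to verify the load (a hypothetical sketch, not part of this commit) is to count the rows and pull the earliest record back through the same Database session:

from database import Database
from model.ticker import Ticker

# Hypothetical verification sketch: assumes load_database() has
# already populated finance.db.
db = Database()
total = db.session.query(Ticker).count()
first = db.session.query(Ticker).order_by(Ticker.date).first()
print(f"{total} rows loaded; earliest: {first.symbol} on {first.date}")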
model/base.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
model/ticker.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from sqlalchemy import Column
from sqlalchemy import DateTime
from sqlalchemy import Float
from sqlalchemy import Integer
from sqlalchemy import String
from model.base import Base


class Ticker(Base):
    __tablename__ = 'ticker'

    id = Column(Integer, primary_key=True, autoincrement=True)
    date = Column(DateTime)
    symbol = Column(String(15), nullable=False)
    open = Column(Float)
    high = Column(Float)
    low = Column(Float)
    close = Column(Float)
    volume = Column(Integer)
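Since these columns drive the DDL that Database.create_tables() emits, a small inspection sketch (hypothetical, not in the commit) can print the CREATE TABLE statement SQLAlchemy would generate for SQLite:

from sqlalchemy import create_engine
from sqlalchemy.schema import CreateTable
from model.ticker import Ticker

# Hypothetical inspection sketch: compiles the ticker table's DDL
# against an in-memory SQLite engine without creating anything.
engine = create_engine("sqlite://")
print(CreateTable(Ticker.__table__).compile(engine))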
notebooks/__init__.py (new file, empty)

notebooks/eda.ipynb (new file, 355 lines)
File diff suppressed because one or more lines are too long