{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "63e84164-3f22-4718-aa50-8c1587985e59",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "# Render floats with three fixed decimals so pandas output never uses\n",
    "# scientific notation.\n",
    "pd.set_option('display.float_format', lambda value: '%.3f' % value)\n",
    "\n",
    "\n",
    "# Put the parent of the current working directory at the front of sys.path\n",
    "# before the project imports below.\n",
    "parent_dir = os.path.join(os.getcwd(), '..')\n",
    "sys.path.insert(0, os.path.abspath(parent_dir))\n",
    "\n",
    "\n",
    "from database import Database\n",
    "from model.ticker import Ticker\n",
    "\n",
    "\n",
    "DB_PATH = \"sqlite:///../finance.db\"\n",
    "\n",
    "\n",
    "db = Database(database_url=DB_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "59cb5f64-18d3-4f0a-950a-123a6e7bc6f7",
   "metadata": {},
   "source": [
    "# Let's Do Some High Level Queries On The Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3cc8fb31-406d-4635-98e7-4a2b7ca19086",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the distinct values of Ticker.symbol.\n",
    "symbol_query = db.session.query(Ticker.symbol)\n",
    "unique_tickers = symbol_query.distinct().count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7540c4ec-0ec5-4536-80dc-b222e37f43d3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4000"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unique_tickers"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69153cc0-58ed-4fe1-bac1-ea4179f17a70",
   "metadata": {},
   "source": [
    "# Load A Sample Of Data Into Pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20650f2c-7bdd-4663-bab1-124a667be5a6",
   "metadata": {},
   "source": [
    "We want to load a sample since the database has roughly 25 million rows. Sampling makes the demo easier to present without crashing the browser."
] },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "eada2911-1eef-4447-83f5-48471463d5e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "year = 2010\n",
    "\n",
    "# Select one calendar year with a half-open range on the raw date column.\n",
    "# The bounds are bound as parameters instead of being interpolated into the\n",
    "# SQL text, and the column is no longer wrapped in strftime(), so SQLite\n",
    "# can use an index on ticker.date if one exists.\n",
    "# NOTE(review): assumes ticker.date is stored as ISO-8601 text\n",
    "# ('YYYY-MM-DD HH:MM:SS.ffffff'), as the parse format in the load cell\n",
    "# suggests -- confirm against the schema.\n",
    "query = \"SELECT * FROM ticker WHERE ticker.date >= ? AND ticker.date < ?\"\n",
    "query_params = (f\"{year:04d}-01-01\", f\"{year + 1:04d}-01-01\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "4997978f-c1ea-4b51-b329-8d7fa97c1b53",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the selected year and parse the date column into datetimes.\n",
    "df = pd.read_sql(query, db.engine, params=query_params)\n",
    "df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S.%f')\n",
    "\n",
    "# Sanity check: the range filter must return rows from the requested year only.\n",
    "assert df['date'].dt.year.eq(year).all(), \"query returned rows outside the requested year\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "796a5ed7-1255-4e6b-aa1d-28a3cedfc71e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | id | \n", "date | \n", "open | \n", "high | \n", "low | \n", "close | \n", "volume | \n", "
---|---|---|---|---|---|---|---|
count | \n", "1044000.000 | \n", "1044000 | \n", "1044000.000 | \n", "1044000.000 | \n", "1044000.000 | \n", "1044000.000 | \n", "1044000.000 | \n", "
mean | \n", "12521609.500 | \n", "2010-07-02 19:07:35.172413440 | \n", "249.979 | \n", "263.009 | \n", "235.933 | \n", "249.988 | \n", "1075619.915 | \n", "
min | \n", "2610.000 | \n", "2010-01-01 00:00:00 | \n", "0.985 | \n", "1.079 | \n", "0.936 | \n", "1.034 | \n", "49077.000 | \n", "
25% | \n", "6262109.750 | \n", "2010-04-02 00:00:00 | \n", "86.348 | \n", "90.865 | \n", "81.526 | \n", "86.464 | \n", "564369.000 | \n", "
50% | \n", "12521609.500 | \n", "2010-07-02 00:00:00 | \n", "181.516 | \n", "191.091 | \n", "171.390 | \n", "181.713 | \n", "1087653.000 | \n", "
75% | \n", "18781109.250 | \n", "2010-10-01 00:00:00 | \n", "336.361 | \n", "353.789 | \n", "317.443 | \n", "336.434 | \n", "1581562.500 | \n", "
max | \n", "25040609.000 | \n", "2010-12-31 00:00:00 | \n", "2520.380 | \n", "2631.542 | \n", "2335.396 | \n", "2461.570 | \n", "2299658.000 | \n", "
std | \n", "7229583.308 | \n", "NaN | \n", "237.887 | \n", "250.233 | \n", "224.430 | \n", "237.659 | \n", "591062.757 | \n", "