Update common.py
common.py CHANGED
@@ -1,22 +1,26 @@
 import re
 import json
+from typing import Optional, Tuple
+
 import numpy as np
 import pandas as pd
-from typing import Optional, Tuple
 from statsmodels.tsa.holtwinters import ExponentialSmoothing, Holt

 try:
     from prophet import Prophet
+
     _HAS_PROPHET = True
 except Exception:
     _HAS_PROPHET = False

 _KEEP = re.compile(r"[^А-Яа-яЁё0-9 ,.!?:;()«»\"'–—\-•\n]")

+
 def clean_ru(text: str) -> str:
     text = _KEEP.sub(" ", text or "")
     return re.sub(r"\s+", " ", text).strip()

+
 def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
     work = df.copy()
     for col in list(work.columns):
@@ -29,50 +33,82 @@ def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
             work.rename(columns={col: "category"}, inplace=True)
         elif lc in ("type", "тип"):
             work.rename(columns={col: "type"}, inplace=True)
+
     required = {"date", "amount", "type"}
     missing = required - set(map(str, work.columns))
     if missing:
         raise ValueError(f"Отсутствуют колонки: {', '.join(sorted(missing))}")
+
     work["date"] = pd.to_datetime(work["date"], errors="coerce")
     work = work.dropna(subset=["date"])
+
     work["amount"] = pd.to_numeric(work["amount"], errors="coerce").fillna(0.0)
+
     if "category" not in work.columns:
         work["category"] = "Без категории"
+
     return work

+
 def is_expense(t: str) -> bool:
     t = str(t).strip().lower()
     return t in {"expense", "расход", "расходы", "-", "e", "exp"}

+
 def is_income(t: str) -> bool:
     t = str(t).strip().lower()
     return t in {"income", "доход", "+", "i", "inc"}

-
+
+def prepare_components_series(
+    df: pd.DataFrame, freq: str = "M"
+) -> Tuple[pd.Series, pd.Series, pd.Series]:
     if df is None or df.empty:
         raise ValueError("Пустая таблица транзакций.")
+
     work = normalize_columns(df)
     work["is_expense"] = work["type"].apply(is_expense)
     work["is_income"] = work["type"].apply(is_income)

-    inc =
-
+    inc = (
+        work.loc[work["is_income"]]
+        .set_index("date")["amount"]
+        .resample(freq)
+        .sum()
+        .sort_index()
+    )
+    exp = (
+        work.loc[work["is_expense"]]
+        .set_index("date")["amount"]
+        .abs()
+        .mul(-1)
+        .resample(freq)
+        .sum()
+        .sort_index()
+    )

     if not inc.empty or not exp.empty:
         start = min([x.index.min() for x in [inc, exp] if not x.empty])
-        end
+        end = max([x.index.max() for x in [inc, exp] if not x.empty])
         full_idx = pd.date_range(start, end, freq=freq)
         inc = inc.reindex(full_idx, fill_value=0.0)
         exp = exp.reindex(full_idx, fill_value=0.0)
+
     net = inc + exp
     inc.index.name = exp.index.name = net.index.name = "period_end"
     return inc, exp, net

-
+
+def fit_and_forecast(
+    history: pd.Series, steps: int, freq: str, method: str = "auto"
+) -> pd.Series:
     if len(history) < 3:
         last = float(history.iloc[-1]) if len(history) else 0.0
-        start = (
-
+        start = (
+            history.index[-1]
+            if len(history)
+            else pd.Timestamp.today().normalize()
+        ) + pd.tseries.frequencies.to_offset(freq)
         idx = pd.date_range(start, periods=steps, freq=freq)
         return pd.Series([last] * steps, index=idx, name="forecast")

@@ -80,9 +116,9 @@ def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "auto"
     if method == "prophet":
         use_prophet = True
     elif method == "auto":
-        if freq.startswith("A"):
+        if freq.startswith("A"):  # annual
             use_prophet = _HAS_PROPHET and (len(history) >= 5)
-        else:
+        else:  # monthly
             use_prophet = _HAS_PROPHET and (len(history) >= 18)

     if use_prophet:
@@ -100,7 +136,11 @@ def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "auto"
             m.fit(dfp)
             future = m.make_future_dataframe(periods=steps, freq=pfreq)
             fcst = m.predict(future).tail(steps)
-            yhat = pd.Series(
+            yhat = pd.Series(
+                fcst["yhat"].values,
+                index=pd.DatetimeIndex(fcst["ds"]),
+                name="forecast",
+            )

             if pfreq == "M":
                 yhat.index = yhat.index.to_period("M").to_timestamp(how="end")
@@ -108,24 +148,34 @@ def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "auto"
                 yhat.index = yhat.index.to_period("Y").to_timestamp(how="end")

             if yhat.index.freq is None:
-                yhat.index = pd.date_range(
+                yhat.index = pd.date_range(
+                    yhat.index[0],
+                    periods=len(yhat),
+                    freq=("A-DEC" if pfreq == "Y" else "M"),
+                )
             return yhat
         except Exception:
             pass

+    # Holt / Holt-Winters
     try:
         if freq.startswith("A"):
             model = Holt(history, initialization_method="estimated")
         else:
             if len(history) >= 24:
                 model = ExponentialSmoothing(
-                    history,
-
+                    history,
+                    trend="add",
+                    seasonal="add",
+                    seasonal_periods=12,
+                    initialization_method="estimated",
                 )
             else:
                 model = Holt(history, initialization_method="estimated")
+
         fit = model.fit(optimized=True)
         fc = fit.forecast(steps)
+
         if not isinstance(fc.index, pd.DatetimeIndex) or len(fc.index) != steps:
             start = history.index[-1] + pd.tseries.frequencies.to_offset(freq)
             idx = pd.date_range(start, periods=steps, freq=freq)
@@ -138,38 +188,52 @@ def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "auto"
         idx = pd.date_range(start, periods=steps, freq=freq)
         return pd.Series([baseline] * steps, index=idx, name="forecast")

+
 def current_month_snapshot(df: pd.DataFrame) -> dict:
     if df is None or df.empty:
         return {}
     w = normalize_columns(df)
     w["is_income"] = w["type"].apply(is_income)
     w["is_expense"] = w["type"].apply(is_expense)
+
     lastp = w["date"].dt.to_period("M").max()
     cur = w[w["date"].dt.to_period("M") == lastp].copy()
     if cur.empty:
         return {}
-
+
+    income_total = float(cur.loc[cur["is_income"], "amount"].sum())
     expense_total = -float(cur.loc[cur["is_expense"], "amount"].abs().sum())
     net = income_total + expense_total
-
+
+    exp_df = cur.loc[cur["is_expense"], ["category", "amount"]].copy()
     exp_df["amount"] = -exp_df["amount"].abs()
-    top =
+    top = (
+        exp_df.groupby("category")["amount"]
+        .sum()
+        .sort_values()
+        .head(5)
+    )
+
     return {
         "month": str(lastp),
         "income_total": income_total,
         "expense_total": expense_total,
         "net": net,
-        "top_expense_categories": [
+        "top_expense_categories": [
+            (str(k), float(v)) for k, v in top.items()
+        ],
     }

+
 def read_json_stdin() -> dict:
     import sys
+
     raw = sys.stdin.read()
     return json.loads(raw or "{}")

+
 def write_json_stdout(obj) -> None:
-    sys.stdout.reconfigure(encoding="utf-8")
-    import json
     import sys
+
     sys.stdout.write(json.dumps(obj, ensure_ascii=False))
     sys.stdout.flush()
|