minstradamus committed on
Commit
7fa9875
·
verified ·
1 Parent(s): f7ac86a

Update common.py

Browse files
Files changed (1) hide show
  1. common.py +84 -20
common.py CHANGED
@@ -1,22 +1,26 @@
1
  import re
2
  import json
 
 
3
  import numpy as np
4
  import pandas as pd
5
- from typing import Optional, Tuple
6
  from statsmodels.tsa.holtwinters import ExponentialSmoothing, Holt
7
 
8
  try:
9
  from prophet import Prophet
 
10
  _HAS_PROPHET = True
11
  except Exception:
12
  _HAS_PROPHET = False
13
 
14
  _KEEP = re.compile(r"[^А-Яа-яЁё0-9 ,.!?:;()«»\"'–—\-•\n]")
15
 
 
def clean_ru(text: str) -> str:
    """Drop characters rejected by the module's ``_KEEP`` pattern and squeeze whitespace.

    Args:
        text: Raw text; ``None`` or an empty string is treated as ``""``.

    Returns:
        The text with every character matched by ``_KEEP`` replaced by a
        space, runs of whitespace collapsed to a single space, and leading
        and trailing whitespace removed.
    """
    filtered = _KEEP.sub(" ", text or "")
    # Newlines survive the ``_KEEP`` filter but are folded into single
    # spaces here together with every other whitespace run.
    return re.sub(r"\s+", " ", filtered).strip()
19
 
 
20
  def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
21
  work = df.copy()
22
  for col in list(work.columns):
@@ -29,50 +33,82 @@ def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
29
  work.rename(columns={col: "category"}, inplace=True)
30
  elif lc in ("type", "тип"):
31
  work.rename(columns={col: "type"}, inplace=True)
 
32
  required = {"date", "amount", "type"}
33
  missing = required - set(map(str, work.columns))
34
  if missing:
35
  raise ValueError(f"Отсутствуют колонки: {', '.join(sorted(missing))}")
 
36
  work["date"] = pd.to_datetime(work["date"], errors="coerce")
37
  work = work.dropna(subset=["date"])
 
38
  work["amount"] = pd.to_numeric(work["amount"], errors="coerce").fillna(0.0)
 
39
  if "category" not in work.columns:
40
  work["category"] = "Без категории"
 
41
  return work
42
 
 
def is_expense(t: str) -> bool:
    """Return ``True`` when *t* is one of the recognised expense markers.

    The value is converted with ``str()``, stripped and lower-cased before
    the comparison, so ``" Expense "`` and ``"РАСХОД"`` both match.
    """
    marker = str(t).strip().lower()
    return marker in {"expense", "расход", "расходы", "-", "e", "exp"}
46
 
 
def is_income(t: str) -> bool:
    """Return ``True`` when *t* is one of the recognised income markers.

    The value is converted with ``str()``, stripped and lower-cased before
    the comparison. The plural form "доходы" is accepted alongside "доход",
    mirroring the singular/plural pair that ``is_expense`` recognises.
    """
    marker = str(t).strip().lower()
    return marker in {"income", "доход", "доходы", "+", "i", "inc"}
50
 
def prepare_components_series(
    df: pd.DataFrame, freq: str = "M"
) -> Tuple[pd.Series, pd.Series, pd.Series]:
    """Aggregate transactions into income, expense and net series.

    Args:
        df: Transaction table; it is passed through ``normalize_columns``.
        freq: pandas offset alias used for resampling (``"M"`` by default).

    Returns:
        ``(inc, exp, net)``: income sums, expense sums (stored as
        non-positive numbers) and their element-wise sum. When at least one
        of the first two is non-empty, both are reindexed onto one
        continuous ``pd.date_range`` with missing periods filled with 0.0.
        Every index is named ``"period_end"``.

    Raises:
        ValueError: If *df* is ``None`` or empty.
    """
    if df is None or df.empty:
        raise ValueError("Пустая таблица транзакций.")

    work = normalize_columns(df)
    work["is_expense"] = work["type"].apply(is_expense)
    work["is_income"] = work["type"].apply(is_income)

    inc = (
        work.loc[work["is_income"]]
        .set_index("date")["amount"]
        .resample(freq)
        .sum()
        .sort_index()
    )
    # Expenses are forced negative regardless of the sign used in the input.
    exp = (
        work.loc[work["is_expense"]]
        .set_index("date")["amount"]
        .abs()
        .mul(-1)
        .resample(freq)
        .sum()
        .sort_index()
    )

    # Align both series on one gap-free index spanning whichever of them
    # actually contains data.
    non_empty = [series for series in (inc, exp) if not series.empty]
    if non_empty:
        start = min(series.index.min() for series in non_empty)
        end = max(series.index.max() for series in non_empty)
        full_idx = pd.date_range(start, end, freq=freq)
        inc = inc.reindex(full_idx, fill_value=0.0)
        exp = exp.reindex(full_idx, fill_value=0.0)

    net = inc + exp
    inc.index.name = exp.index.name = net.index.name = "period_end"
    return inc, exp, net
70
 
71
- def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "auto") -> pd.Series:
 
 
 
72
  if len(history) < 3:
73
  last = float(history.iloc[-1]) if len(history) else 0.0
74
- start = (history.index[-1] if len(history) else pd.Timestamp.today().normalize()) + \
75
- pd.tseries.frequencies.to_offset(freq)
 
 
 
76
  idx = pd.date_range(start, periods=steps, freq=freq)
77
  return pd.Series([last] * steps, index=idx, name="forecast")
78
 
@@ -80,9 +116,9 @@ def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "a
80
  if method == "prophet":
81
  use_prophet = True
82
  elif method == "auto":
83
- if freq.startswith("A"):
84
  use_prophet = _HAS_PROPHET and (len(history) >= 5)
85
- else:
86
  use_prophet = _HAS_PROPHET and (len(history) >= 18)
87
 
88
  if use_prophet:
@@ -100,7 +136,11 @@ def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "a
100
  m.fit(dfp)
101
  future = m.make_future_dataframe(periods=steps, freq=pfreq)
102
  fcst = m.predict(future).tail(steps)
103
- yhat = pd.Series(fcst["yhat"].values, index=pd.DatetimeIndex(fcst["ds"]), name="forecast")
 
 
 
 
104
 
105
  if pfreq == "M":
106
  yhat.index = yhat.index.to_period("M").to_timestamp(how="end")
@@ -108,24 +148,34 @@ def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "a
108
  yhat.index = yhat.index.to_period("Y").to_timestamp(how="end")
109
 
110
  if yhat.index.freq is None:
111
- yhat.index = pd.date_range(yhat.index[0], periods=len(yhat), freq=("A-DEC" if pfreq == "Y" else "M"))
 
 
 
 
112
  return yhat
113
  except Exception:
114
  pass
115
 
 
116
  try:
117
  if freq.startswith("A"):
118
  model = Holt(history, initialization_method="estimated")
119
  else:
120
  if len(history) >= 24:
121
  model = ExponentialSmoothing(
122
- history, trend="add", seasonal="add", seasonal_periods=12,
123
- initialization_method="estimated"
 
 
 
124
  )
125
  else:
126
  model = Holt(history, initialization_method="estimated")
 
127
  fit = model.fit(optimized=True)
128
  fc = fit.forecast(steps)
 
129
  if not isinstance(fc.index, pd.DatetimeIndex) or len(fc.index) != steps:
130
  start = history.index[-1] + pd.tseries.frequencies.to_offset(freq)
131
  idx = pd.date_range(start, periods=steps, freq=freq)
@@ -138,38 +188,52 @@ def fit_and_forecast(history: pd.Series, steps: int, freq: str, method: str = "a
138
  idx = pd.date_range(start, periods=steps, freq=freq)
139
  return pd.Series([baseline] * steps, index=idx, name="forecast")
140
 
 
def current_month_snapshot(df: pd.DataFrame) -> dict:
    """Summarise income and expenses for the latest month present in *df*.

    Args:
        df: Transaction table; it is passed through ``normalize_columns``.

    Returns:
        ``{}`` when *df* is ``None``/empty or no rows fall in the latest
        month; otherwise a dict with keys ``"month"``, ``"income_total"``,
        ``"expense_total"`` (non-positive), ``"net"`` and
        ``"top_expense_categories"`` (up to five ``(category, total)``
        pairs, largest spend first).
    """
    if df is None or df.empty:
        return {}

    w = normalize_columns(df)
    w["is_income"] = w["type"].apply(is_income)
    w["is_expense"] = w["type"].apply(is_expense)

    # Convert to monthly periods once and reuse the result for both the
    # maximum and the row filter.
    periods = w["date"].dt.to_period("M")
    lastp = periods.max()
    cur = w[periods == lastp].copy()
    if cur.empty:
        return {}

    income_total = float(cur.loc[cur["is_income"], "amount"].sum())
    expense_total = -float(cur.loc[cur["is_expense"], "amount"].abs().sum())
    net = income_total + expense_total

    exp_df = cur.loc[cur["is_expense"], ["category", "amount"]].copy()
    exp_df["amount"] = -exp_df["amount"].abs()
    # Totals are negative, so an ascending sort puts the largest spend first.
    top = (
        exp_df.groupby("category")["amount"]
        .sum()
        .sort_values()
        .head(5)
    )

    return {
        "month": str(lastp),
        "income_total": income_total,
        "expense_total": expense_total,
        "net": net,
        "top_expense_categories": [
            (str(k), float(v)) for k, v in top.items()
        ],
    }
164
 
 
def read_json_stdin() -> dict:
    """Read all of standard input and decode it as JSON.

    Returns:
        The decoded JSON value, or ``{}`` when stdin is empty or contains
        only whitespace.

    Raises:
        json.JSONDecodeError: If stdin holds non-blank text that is not
            valid JSON.
    """
    import sys

    raw = sys.stdin.read()
    # A bare newline (e.g. from ``echo "" | script``) is not valid JSON, so
    # blank input is treated the same as no input at all.
    if not raw or not raw.strip():
        return {}
    return json.loads(raw)
169
 
 
def write_json_stdout(obj) -> None:
    """Serialise *obj* as JSON and write it to standard output.

    Non-ASCII characters are emitted verbatim (``ensure_ascii=False``), so
    stdout is switched to UTF-8 first when the stream supports it.

    Args:
        obj: Any value accepted by ``json.dumps``.
    """
    # ``sys`` must be imported before its first use: a function-local import
    # placed after the use makes the name local and raises UnboundLocalError.
    import sys

    # Best effort: replaced streams (e.g. ``io.StringIO``) have no
    # ``reconfigure``; in that case the stream's own encoding is left alone.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if callable(reconfigure):
        reconfigure(encoding="utf-8")

    sys.stdout.write(json.dumps(obj, ensure_ascii=False))
    sys.stdout.flush()
 
1
  import re
2
  import json
3
+ from typing import Optional, Tuple
4
+
5
  import numpy as np
6
  import pandas as pd
 
7
  from statsmodels.tsa.holtwinters import ExponentialSmoothing, Holt
8
 
9
  try:
10
  from prophet import Prophet
11
+
12
  _HAS_PROPHET = True
13
  except Exception:
14
  _HAS_PROPHET = False
15
 
16
  _KEEP = re.compile(r"[^А-Яа-яЁё0-9 ,.!?:;()«»\"'–—\-•\n]")
17
 
18
+
def clean_ru(text: str) -> str:
    """Drop characters rejected by the module's ``_KEEP`` pattern and squeeze whitespace.

    Args:
        text: Raw text; ``None`` or an empty string is treated as ``""``.

    Returns:
        The text with every character matched by ``_KEEP`` replaced by a
        space, runs of whitespace collapsed to a single space, and leading
        and trailing whitespace removed.
    """
    filtered = _KEEP.sub(" ", text or "")
    # Newlines survive the ``_KEEP`` filter but are folded into single
    # spaces here together with every other whitespace run.
    return re.sub(r"\s+", " ", filtered).strip()
22
 
23
+
24
  def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
25
  work = df.copy()
26
  for col in list(work.columns):
 
33
  work.rename(columns={col: "category"}, inplace=True)
34
  elif lc in ("type", "тип"):
35
  work.rename(columns={col: "type"}, inplace=True)
36
+
37
  required = {"date", "amount", "type"}
38
  missing = required - set(map(str, work.columns))
39
  if missing:
40
  raise ValueError(f"Отсутствуют колонки: {', '.join(sorted(missing))}")
41
+
42
  work["date"] = pd.to_datetime(work["date"], errors="coerce")
43
  work = work.dropna(subset=["date"])
44
+
45
  work["amount"] = pd.to_numeric(work["amount"], errors="coerce").fillna(0.0)
46
+
47
  if "category" not in work.columns:
48
  work["category"] = "Без категории"
49
+
50
  return work
51
 
52
+
def is_expense(t: str) -> bool:
    """Return ``True`` when *t* is one of the recognised expense markers.

    The value is converted with ``str()``, stripped and lower-cased before
    the comparison, so ``" Expense "`` and ``"РАСХОД"`` both match.
    """
    marker = str(t).strip().lower()
    return marker in {"expense", "расход", "расходы", "-", "e", "exp"}
56
 
57
+
def is_income(t: str) -> bool:
    """Return ``True`` when *t* is one of the recognised income markers.

    The value is converted with ``str()``, stripped and lower-cased before
    the comparison. The plural form "доходы" is accepted alongside "доход",
    mirroring the singular/plural pair that ``is_expense`` recognises.
    """
    marker = str(t).strip().lower()
    return marker in {"income", "доход", "доходы", "+", "i", "inc"}
61
 
62
+
def prepare_components_series(
    df: pd.DataFrame, freq: str = "M"
) -> Tuple[pd.Series, pd.Series, pd.Series]:
    """Aggregate transactions into income, expense and net series.

    Args:
        df: Transaction table; it is passed through ``normalize_columns``.
        freq: pandas offset alias used for resampling (``"M"`` by default).

    Returns:
        ``(inc, exp, net)``: income sums, expense sums (stored as
        non-positive numbers) and their element-wise sum. When at least one
        of the first two is non-empty, both are reindexed onto one
        continuous ``pd.date_range`` with missing periods filled with 0.0.
        Every index is named ``"period_end"``.

    Raises:
        ValueError: If *df* is ``None`` or empty.
    """
    if df is None or df.empty:
        raise ValueError("Пустая таблица транзакций.")

    work = normalize_columns(df)
    work["is_expense"] = work["type"].apply(is_expense)
    work["is_income"] = work["type"].apply(is_income)

    inc = (
        work.loc[work["is_income"]]
        .set_index("date")["amount"]
        .resample(freq)
        .sum()
        .sort_index()
    )
    # Expenses are forced negative regardless of the sign used in the input.
    exp = (
        work.loc[work["is_expense"]]
        .set_index("date")["amount"]
        .abs()
        .mul(-1)
        .resample(freq)
        .sum()
        .sort_index()
    )

    # Align both series on one gap-free index spanning whichever of them
    # actually contains data.
    non_empty = [series for series in (inc, exp) if not series.empty]
    if non_empty:
        start = min(series.index.min() for series in non_empty)
        end = max(series.index.max() for series in non_empty)
        full_idx = pd.date_range(start, end, freq=freq)
        inc = inc.reindex(full_idx, fill_value=0.0)
        exp = exp.reindex(full_idx, fill_value=0.0)

    net = inc + exp
    inc.index.name = exp.index.name = net.index.name = "period_end"
    return inc, exp, net
100
 
101
+
102
+ def fit_and_forecast(
103
+ history: pd.Series, steps: int, freq: str, method: str = "auto"
104
+ ) -> pd.Series:
105
  if len(history) < 3:
106
  last = float(history.iloc[-1]) if len(history) else 0.0
107
+ start = (
108
+ history.index[-1]
109
+ if len(history)
110
+ else pd.Timestamp.today().normalize()
111
+ ) + pd.tseries.frequencies.to_offset(freq)
112
  idx = pd.date_range(start, periods=steps, freq=freq)
113
  return pd.Series([last] * steps, index=idx, name="forecast")
114
 
 
116
  if method == "prophet":
117
  use_prophet = True
118
  elif method == "auto":
119
+ if freq.startswith("A"): # годовая
120
  use_prophet = _HAS_PROPHET and (len(history) >= 5)
121
+ else: # месячная
122
  use_prophet = _HAS_PROPHET and (len(history) >= 18)
123
 
124
  if use_prophet:
 
136
  m.fit(dfp)
137
  future = m.make_future_dataframe(periods=steps, freq=pfreq)
138
  fcst = m.predict(future).tail(steps)
139
+ yhat = pd.Series(
140
+ fcst["yhat"].values,
141
+ index=pd.DatetimeIndex(fcst["ds"]),
142
+ name="forecast",
143
+ )
144
 
145
  if pfreq == "M":
146
  yhat.index = yhat.index.to_period("M").to_timestamp(how="end")
 
148
  yhat.index = yhat.index.to_period("Y").to_timestamp(how="end")
149
 
150
  if yhat.index.freq is None:
151
+ yhat.index = pd.date_range(
152
+ yhat.index[0],
153
+ periods=len(yhat),
154
+ freq=("A-DEC" if pfreq == "Y" else "M"),
155
+ )
156
  return yhat
157
  except Exception:
158
  pass
159
 
160
+ # Holt / Holt-Winters
161
  try:
162
  if freq.startswith("A"):
163
  model = Holt(history, initialization_method="estimated")
164
  else:
165
  if len(history) >= 24:
166
  model = ExponentialSmoothing(
167
+ history,
168
+ trend="add",
169
+ seasonal="add",
170
+ seasonal_periods=12,
171
+ initialization_method="estimated",
172
  )
173
  else:
174
  model = Holt(history, initialization_method="estimated")
175
+
176
  fit = model.fit(optimized=True)
177
  fc = fit.forecast(steps)
178
+
179
  if not isinstance(fc.index, pd.DatetimeIndex) or len(fc.index) != steps:
180
  start = history.index[-1] + pd.tseries.frequencies.to_offset(freq)
181
  idx = pd.date_range(start, periods=steps, freq=freq)
 
188
  idx = pd.date_range(start, periods=steps, freq=freq)
189
  return pd.Series([baseline] * steps, index=idx, name="forecast")
190
 
191
+
def current_month_snapshot(df: pd.DataFrame) -> dict:
    """Summarise income and expenses for the latest month present in *df*.

    Args:
        df: Transaction table; it is passed through ``normalize_columns``.

    Returns:
        ``{}`` when *df* is ``None``/empty or no rows fall in the latest
        month; otherwise a dict with keys ``"month"``, ``"income_total"``,
        ``"expense_total"`` (non-positive), ``"net"`` and
        ``"top_expense_categories"`` (up to five ``(category, total)``
        pairs, largest spend first).
    """
    if df is None or df.empty:
        return {}

    w = normalize_columns(df)
    w["is_income"] = w["type"].apply(is_income)
    w["is_expense"] = w["type"].apply(is_expense)

    # Convert to monthly periods once and reuse the result for both the
    # maximum and the row filter.
    periods = w["date"].dt.to_period("M")
    lastp = periods.max()
    cur = w[periods == lastp].copy()
    if cur.empty:
        return {}

    income_total = float(cur.loc[cur["is_income"], "amount"].sum())
    expense_total = -float(cur.loc[cur["is_expense"], "amount"].abs().sum())
    net = income_total + expense_total

    exp_df = cur.loc[cur["is_expense"], ["category", "amount"]].copy()
    exp_df["amount"] = -exp_df["amount"].abs()
    # Totals are negative, so an ascending sort puts the largest spend first.
    top = (
        exp_df.groupby("category")["amount"]
        .sum()
        .sort_values()
        .head(5)
    )

    return {
        "month": str(lastp),
        "income_total": income_total,
        "expense_total": expense_total,
        "net": net,
        "top_expense_categories": [
            (str(k), float(v)) for k, v in top.items()
        ],
    }
226
 
227
+
def read_json_stdin() -> dict:
    """Read all of standard input and decode it as JSON.

    Returns:
        The decoded JSON value, or ``{}`` when stdin is empty or contains
        only whitespace.

    Raises:
        json.JSONDecodeError: If stdin holds non-blank text that is not
            valid JSON.
    """
    import sys

    raw = sys.stdin.read()
    # A bare newline (e.g. from ``echo "" | script``) is not valid JSON, so
    # blank input is treated the same as no input at all.
    if not raw or not raw.strip():
        return {}
    return json.loads(raw)
233
 
234
+
def write_json_stdout(obj) -> None:
    """Serialise *obj* as JSON and write it to standard output.

    Non-ASCII characters are emitted verbatim (``ensure_ascii=False``), so
    stdout is switched to UTF-8 first when the stream supports it;
    otherwise a non-UTF-8 console encoding could reject the output.

    Args:
        obj: Any value accepted by ``json.dumps``.
    """
    import sys

    # Best effort: replaced streams (e.g. ``io.StringIO``) have no
    # ``reconfigure``; in that case the stream's own encoding is left alone.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if callable(reconfigure):
        reconfigure(encoding="utf-8")

    sys.stdout.write(json.dumps(obj, ensure_ascii=False))
    sys.stdout.flush()