# exo/model.py
import os

import numpy as np
import pandas as pd
import joblib   # model persistence (see the save sketch at the end)
import optuna   # optional hyperparameter tuning (see the sketch after Training)
from google.colab import drive
def EDA(file):
    """Quick exploratory look at a dataframe."""
    pd.set_option('display.max_columns', 200)
    print("Shape from file:", file.shape)
    # Uncomment for a closer look (display() is available in notebooks):
    # display(file.head())
    # print(file.dtypes)
def PipelineCreation(file, target_col):
    """Split a dataframe into features/target and build the preprocessing
    ColumnTransformer: median imputation for numeric columns, most-frequent
    imputation plus one-hot encoding for categorical ones."""
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer

    # Drop identifier-like columns so they are not used as features.
    id_cols = [c for c in ['id', 'time', 'index'] if c in file.columns]
    x = file.drop(columns=[target_col] + id_cols, errors='ignore')
    y = file[target_col]

    numeric_features = x.select_dtypes(include=['number']).columns.tolist()
    categorical_features = x.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    print(f"Numeric features: {len(numeric_features)}, categorical features: {len(categorical_features)}")

    numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # sparse_output requires scikit-learn >= 1.2 (older versions use `sparse`).
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ], remainder='drop')
    return x, y, preprocessor
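
# Optional helper: recover readable column names from the fitted preprocessor.
# A minimal sketch, assuming scikit-learn >= 1.0 (where ColumnTransformer
# exposes get_feature_names_out). The code below wraps the transformed arrays
# in DataFrames with integer column labels; this helper would give them
# meaningful names instead. Not called by default.
def transformed_feature_names(fitted_preprocessor):
    """Return the output feature names of a fitted ColumnTransformer."""
    return fitted_preprocessor.get_feature_names_out().tolist()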
def Training(x_train_clean, y_train, x_val_clean, y_val):
    """Train a binary LightGBM classifier with early stopping on AUC."""
    from lightgbm import LGBMClassifier, early_stopping, log_evaluation

    # An earlier variant countered class imbalance with
    # scale_pos_weight = (y_train == 0).sum() / max(1, (y_train == 1).sum()).
    # Note: this LightGBM version does not support the early_stopping_rounds
    # fit argument, so early stopping goes through callbacks instead.
    model = LGBMClassifier(
        objective='binary',
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=31,
        min_child_samples=1,   # remove the minimum-samples-per-leaf restriction
        min_split_gain=0.0,    # allow splits even without an apparent positive gain
        random_state=42,
    )
    model.fit(
        x_train_clean, y_train,
        eval_set=[(x_val_clean, y_val)],
        eval_metric='auc',
        callbacks=[early_stopping(100), log_evaluation(100)],
    )
    print("Best iteration:", model.best_iteration_)
    return model
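
# Optional: hyperparameter tuning with Optuna (imported at the top but
# otherwise unused). A minimal sketch, not called by default; the search
# space and trial count are illustrative assumptions, not tuned values.
def tune_with_optuna(x_train, y_train, x_val, y_val, n_trials=20):
    from lightgbm import LGBMClassifier
    from sklearn.metrics import roc_auc_score

    def objective(trial):
        model = LGBMClassifier(
            objective='binary',
            n_estimators=500,
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            num_leaves=trial.suggest_int('num_leaves', 15, 127),
            min_child_samples=trial.suggest_int('min_child_samples', 1, 50),
            random_state=42,
        )
        model.fit(x_train, y_train)
        # Maximize validation AUC, matching the eval_metric used elsewhere.
        return roc_auc_score(y_val, model.predict_proba(x_val)[:, 1])

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    print("Best params:", study.best_params, "best AUC:", study.best_value)
    return study.best_params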
from sklearn.model_selection import train_test_split

# Mount Google Drive only if it is not mounted already.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

labels = pd.read_csv('/content/drive/MyDrive/AI_assets/labels.csv')
light_curves = pd.read_csv('/content/drive/MyDrive/AI_assets/light_curves.csv')
metadata = pd.read_csv('/content/drive/MyDrive/AI_assets/metadata.csv')
# data = pd.read_csv('/content/drive/MyDrive/exoplanets_normalized_data.csv')  # earlier, normalized variant
data = pd.read_csv('/content/drive/MyDrive/data.csv')

# Binary target instead of the raw multi-class labels: 1 for 'CONFIRMED',
# 0 for everything else. This replaces an earlier attempt that used
# 'kepoi_name' as the target column.
data['target'] = (data['koi_disposition'] == 'CONFIRMED').astype(int)
x, y, preprocessor = PipelineCreation(data, target_col='target')
print("First step done. 1 -> PIPELINE CREATION")
# print("X: ", x)
# print("Y: ", y)
x_train, x_val, y_train, y_val = train_test_split(
x, y, test_size=0.20, stratify=y, random_state=42
)
preprocessor.fit(x_train)
x_train_tr = preprocessor.transform(x_train)
x_val_tr = preprocessor.transform(x_val)
EDA(labels)
#debug purposes
#print("Second step done. 2 -> EDA DONE")
#print("X_train shape:", x_train_tr.shape)
#print("X_val shape:", x_val_tr.shape)
#print("y_train distribution:\n", y_train.value_counts())
#print("y_val distribution:\n", y_val.value_counts())
#print("Check for NaNs:", x_train_tr.isna().sum().sum(), "in train,", x_val_tr.isna().sum().sum(), "in val")
# Wrap the transformed arrays in DataFrames and drop zero-variance columns
# (constant columns carry no signal and can confuse downstream checks).
x_train_df = pd.DataFrame(x_train_tr)
x_val_df = pd.DataFrame(x_val_tr)

valid_cols = x_train_df.columns[x_train_df.var() > 0]
x_train_clean = x_train_df[valid_cols]
x_val_clean = x_val_df[valid_cols]

variancias = x_train_df.var()
sem_variancia = (variancias == 0).sum()
# Debug checks, kept for reference:
# print(f"Zero-variance columns: {sem_variancia}/{len(variancias)}")
# print("Columns after filtering:", len(valid_cols))
# print("Final train shape:", x_train_clean.shape)
# print("Class distribution:\n", y_train.value_counts(normalize=True))
# print("Sample values:\n", x_train_clean.head())
# print("Dtypes:", x_train_clean.dtypes.unique())
# print("Value range of the first columns:\n", x_train_clean.iloc[:, :5].describe())
# print("Min:", np.min(x_train_clean.values), "Max:", np.max(x_train_clean.values))
# print("Mean:", np.mean(x_train_clean.values), "Std:", np.std(x_train_clean.values))
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# === 1. Drop near-constant columns ===
var_sel = VarianceThreshold(threshold=1e-4)
x_train_filtered = var_sel.fit_transform(x_train_clean)
x_val_filtered = var_sel.transform(x_val_clean)
# print("Shape after VarianceThreshold:", x_train_filtered.shape)

# === 2. Standardize ===
# Note: the scaled matrices are computed here but the final model below
# trains on x_train_clean; tree-based LightGBM does not require scaling.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_filtered)
x_val_scaled = scaler.transform(x_val_filtered)
# A little debug made with GPT, kept for reference: the mean absolute
# correlation between each feature and the target, and how many features
# exceed 0.05.
# corrs = [abs(np.corrcoef(x_train_clean[col], y_train)[0, 1]) for col in x_train_clean.columns]
# print("Mean correlation with target:", np.nanmean(corrs))
# print("Features with correlation > 0.05:", np.sum(np.array(corrs) > 0.05), "/", len(corrs))
# Ensure dense float32 numpy arrays for LightGBM (handles the sparse case too).
if hasattr(x_train_clean, "todense"):
    x_train_clean = np.asarray(x_train_clean.todense(), dtype=np.float32)
    x_val_clean = np.asarray(x_val_clean.todense(), dtype=np.float32)
else:
    x_train_clean = np.asarray(x_train_clean, dtype=np.float32)
    x_val_clean = np.asarray(x_val_clean, dtype=np.float32)

print("Train - type:", type(x_train_clean))
print("Train - shape:", x_train_clean.shape)
print("Train - dtype:", x_train_clean.dtype)
print("Unique y values:", np.unique(y_train, return_counts=True))
# Final model. The early-stopping variant is available via:
# model = Training(x_train_clean, y_train, x_val_clean, y_val)
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

model = LGBMClassifier(
    objective='binary',
    learning_rate=0.01,
    n_estimators=2000,
    random_state=42,
)
model.fit(x_train_clean, y_train)

train_score = model.score(x_train_clean, y_train)
print("Train score:", train_score)

y_pred = model.predict(x_val_clean)
acc = accuracy_score(y_val, y_pred)
print(f"Accuracy: {acc:.4f}")