# exo2/model.py
import os
import numpy
import pandas
import joblib
import optuna
import tsfresh
import sklearn
import seaborn
import lightgbm
import matplotlib
from google.colab import drive
def EDA(file):
    # Quick exploratory look at a dataframe.
    pandas.set_option('display.max_columns', 200)
    print("Shape from file: ", file.shape)
    # display(file.head())
    # print(file.dtypes)
def PipelineCreation(file, target_col):
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    # Drop identifier-like columns so they never leak into the features.
    id_cols = [c for c in ['id', 'time', 'index'] if c in file.columns]
    x = file.drop(columns=[target_col] + id_cols, errors='ignore')
    y = file[target_col]
    numeric_features = x.select_dtypes(include=['number']).columns.tolist()
    categorical_features = x.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    # print("X: ", x)
    # print("Y: ", y)
    numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    print(f"Numeric features: {len(numeric_features)}, categorical: {len(categorical_features)}")
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ], remainder='drop')
    return x, y, preprocessor
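# A minimal helper sketch (assumption: scikit-learn >= 1.2, which the
# sparse_output flag above already implies): wrap the transformed array back
# into a DataFrame with readable column names, since ColumnTransformer
# otherwise returns an unlabeled array. The name transformed_frame is
# hypothetical, not part of the original script; call it only after
# preprocessor.fit(...).
def transformed_frame(preprocessor, x):
    import pandas as pd
    return pd.DataFrame(
        preprocessor.transform(x),
        columns=preprocessor.get_feature_names_out(),
        index=x.index,
    )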
def Training(x_train_clean, y_train, x_val_clean, y_val):
    from lightgbm import LGBMClassifier
    # Earlier experiment, kept for reference: weight the positive class to
    # compensate for the class imbalance.
    # scale_pos_weight = (y_train == 0).sum() / max(1, (y_train == 1).sum())
    # model = LGBMClassifier(
    #     objective='binary',
    #     n_estimators=10000,
    #     learning_rate=0.05,
    #     num_leaves=31,
    #     random_state=42,
    #     scale_pos_weight=scale_pos_weight,
    # )
    # model.fit(
    #     x_train_clean, y_train,
    #     eval_set=[(x_val_clean, y_val)],
    #     eval_metric='auc',
    #     # This LightGBM version does not accept early_stopping_rounds in
    #     # fit(); the callbacks API must be used instead:
    #     # callbacks=[early_stopping(100), log_evaluation(100)],
    # )
    # print("Best iteration:", model.best_iteration_)
    # print("Train AUC:", model.best_score_['training']['auc'])
    # print("Valid AUC:", model.best_score_['valid_0']['auc'])
    model = LGBMClassifier(
        objective='binary',
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=31,
        # min_child_samples and min_data_in_leaf are aliases in LightGBM,
        # so only one of them is set here.
        min_child_samples=1,  # remove the minimum leaf-size restriction
        min_split_gain=0.0,   # allow splits even without an apparent positive gain
        random_state=42
    )
    model.fit(
        x_train_clean, y_train,
        eval_set=[(x_val_clean, y_val)],
        eval_metric='auc',
    )
    # Note: best_iteration_ is only meaningful when early stopping is used
    # (see the sketch below).
    print("Best iteration:", model.best_iteration_)
    return model
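# A minimal sketch of the early-stopping variant the comments above refer to:
# newer LightGBM versions take callbacks instead of early_stopping_rounds.
# The function name TrainingWithEarlyStopping is hypothetical, not part of
# the original script.
def TrainingWithEarlyStopping(x_train_clean, y_train, x_val_clean, y_val):
    from lightgbm import LGBMClassifier, early_stopping, log_evaluation
    model = LGBMClassifier(
        objective='binary',
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
    )
    model.fit(
        x_train_clean, y_train,
        eval_set=[(x_val_clean, y_val)],
        eval_metric='auc',
        # Stop when the validation AUC has not improved for 100 rounds.
        callbacks=[early_stopping(100), log_evaluation(100)],
    )
    print("Best iteration:", model.best_iteration_)
    print("Valid AUC:", model.best_score_['valid_0']['auc'])
    return model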
from sklearn.model_selection import train_test_split
# Mount Google Drive only if it is not already mounted.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')
labels = pandas.read_csv('/content/drive/MyDrive/AI_assets/labels.csv')
light_curves = pandas.read_csv('/content/drive/MyDrive/AI_assets/light_curves.csv')
metadata = pandas.read_csv('/content/drive/MyDrive/AI_assets/metadata.csv')
#data = pandas.read_csv('/content/drive/MyDrive/exoplanets_normalized_data.csv')
data = pandas.read_csv('/content/drive/MyDrive/data.csv')
# Changed: instead of using 'kepoi_name' as the target, derive a binary target
# from koi_disposition ('CONFIRMED' vs. everything else); more classes may be
# added later (see the sketch below).
#x, y, preprocessor = PipelineCreation(data, target_col='kepoi_name')
data['target'] = data['koi_disposition'].map(
    lambda v: 1 if v == "CONFIRMED" else 0
)
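# A possible multi-class mapping for later (assumption: koi_disposition takes
# the usual KOI values CONFIRMED / CANDIDATE / FALSE POSITIVE); kept commented
# out because the rest of this script assumes a binary target:
# disposition_map = {'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2}
# data['target'] = data['koi_disposition'].map(disposition_map)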
x, y, preprocessor = PipelineCreation(data, target_col='target')
print("First step done. 1 -> PIPELINE CREATION")
# print("X: ", x)
# print("Y: ", y)
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.20, stratify=y, random_state=42
)
preprocessor.fit(x_train)
x_train_tr = preprocessor.transform(x_train)
x_val_tr = preprocessor.transform(x_val)
EDA(labels)
#debug purposes
#print("Second step done. 2 -> EDA DONE")
#print("X_train shape:", x_train_tr.shape)
#print("X_val shape:", x_val_tr.shape)
#print("y_train distribution:\n", y_train.value_counts())
#print("y_val distribution:\n", y_val.value_counts())
#print("Check for NaNs:", x_train_tr.isna().sum().sum(), "in train,", x_val_tr.isna().sum().sum(), "in val")
import pandas as pd
x_train_df = pd.DataFrame(x_train_tr)
x_val_df = pd.DataFrame(x_val_tr)
# Remove columns with zero variance
valid_cols = x_train_df.columns[x_train_df.var() > 0]
x_train_clean = x_train_df[valid_cols]
x_val_clean = x_val_df[valid_cols]
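# (The VarianceThreshold step further below repeats this filtering with
# sklearn; this pandas pass already removed the exact-zero-variance columns.)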
import numpy as np
# x_train_df is the training set (as a DataFrame)
variancias = x_train_df.var()  # per-column variances
sem_variancia = (variancias == 0).sum()
# print(f"Number of zero-variance columns: {sem_variancia}/{len(variancias)}")
# print("Sample of non-zero variances:")
# print(variancias[variancias > 0].head())
# print("Número de colunas após filtragem:", len(valid_cols))
# print("Shape final de treino:", x_train_clean.shape)
# print("Distribuição de classes:")
# print(y_train.value_counts(normalize=True))
# print("Exemplo de valores:")
# print(x_train_clean.head())
# print("Tipo de dados:", x_train_clean.dtypes.unique())
# print("Faixa de valores nas primeiras colunas:")
# print(x_train_clean.iloc[:, :5].describe())
# import numpy as np
# print("Min:", np.min(x_train_clean.values))
# print("Max:", np.max(x_train_clean.values))
# print("Mean:", np.mean(x_train_clean.values))
# print("Std:", np.std(x_train_clean.values))
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from lightgbm import LGBMClassifier
# === 1. Remove nearly constant columns ===
var_sel = VarianceThreshold(threshold=1e-4)
x_train_filtered = var_sel.fit_transform(x_train_clean)
x_val_filtered = var_sel.transform(x_val_clean)
#print("Shape após VarianceThreshold:", x_train_filtered.shape)
# === 2. Padronizar ===
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_filtered)
x_val_scaled = scaler.transform(x_val_filtered)
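# Note: tree models such as LightGBM do not require standardization; the
# scaled arrays above come from an earlier experiment, and the final model
# further below trains on x_train_clean instead.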
# === 3. Train the model ===
# model = LGBMClassifier(
# objective='binary',
# n_estimators=2000,
# learning_rate=0.05,
# num_leaves=31,
# random_state=42
# )
# model.fit(
# x_train_scaled, y_train,
# eval_set=[(x_val_scaled, y_val)],
# eval_metric='auc'
# )
# print("Best iteration:", model.best_iteration_)
#model = Training(x_train_clean, y_train, x_val_clean, y_val)
#print("Finished Training. 3!!")
# import numpy as np
# import pandas as pd
# # Convert if still a numpy array
# x_train_df = pd.DataFrame(x_train_clean)
# # 1) Mean correlation between each feature and the target
# corrs = []
# for col in x_train_df.columns:
#     try:
#         corrs.append(abs(np.corrcoef(x_train_df[col], y_train)[0, 1]))
#     except Exception:
#         corrs.append(0)
# mean_corr = np.nanmean(corrs)
# print("Mean correlation with the target:", mean_corr)
# # 2) Count how many columns have correlation > 0.05
# useful = np.sum(np.array(corrs) > 0.05)
# print("Features with correlation > 0.05:", useful, "/", len(corrs))
# A little debugging block suggested by GPT
# print("Best iteration:", model.best_iteration_)
# print("Train AUC:", model.best_score_['training']['auc'])
# print("Valid AUC:", model.best_score_['valid_0']['auc'])
# Ensure dense float32 arrays before training (handles a sparse-matrix case).
if hasattr(x_train_clean, "todense"):
    x_train_clean = np.array(x_train_clean.todense(), dtype=np.float32)
    x_val_clean = np.array(x_val_clean.todense(), dtype=np.float32)
else:
    x_train_clean = np.array(x_train_clean, dtype=np.float32)
    x_val_clean = np.array(x_val_clean, dtype=np.float32)
print("Train - type:", type(x_train_clean))
print("Train - shape:", x_train_clean.shape)
print("Train - dtype:", x_train_clean.dtype)
print("Unique y values:", np.unique(y_train, return_counts=True))
#model = Training(x_train_clean, y_train, x_val_clean, y_val)
from lightgbm import LGBMClassifier
model = LGBMClassifier(
    objective='binary',
    learning_rate=0.01,
    n_estimators=2000,
    random_state=42
)
model.fit(x_train_clean, y_train)
train_score = model.score(x_train_clean, y_train)
print("Train score:", train_score)
from sklearn.metrics import accuracy_score
y_pred = model.predict(x_val_clean)
acc = accuracy_score(y_val, y_pred)
print(f"Acurácia: {acc:.4f}")