|
|
import os |
|
|
import numpy |
|
|
import pandas |
|
|
import joblib |
|
|
import optuna |
|
|
import tsfresh |
|
|
import sklearn |
|
|
import seaborn |
|
|
import lightgbm |
|
|
import matplotlib |
|
|
from google.colab import drive |
|
|
|
|
|
|
|
|
def EDA(file):
    """Print the shape of a tabular object.

    Also raises pandas' ``display.max_columns`` option to 200. This is a
    process-wide setting, so it affects how every DataFrame is displayed
    after the call.

    Args:
        file: Object exposing a ``shape`` attribute (e.g. a pandas
            DataFrame).

    Returns:
        None. The shape is written to standard output.
    """
    pandas.set_option('display.max_columns', 200)
    print("Shape from file: ", file.shape)
|
|
|
|
|
|
|
|
|
|
|
def PipelineCreation(file, target_col):
    """Split a DataFrame into features/target and build an unfitted preprocessor.

    Args:
        file: pandas DataFrame containing the feature columns and the
            target column.
        target_col: Name of the target column in ``file``.

    Returns:
        A tuple ``(x, y, preprocessor)`` where ``x`` is ``file`` without the
        target and without any of the columns ``'id'``, ``'time'`` or
        ``'index'`` that happen to be present, ``y`` is ``file[target_col]``,
        and ``preprocessor`` is an unfitted ``ColumnTransformer``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``file``.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    # Looked up first so a missing target fails before any other work is done.
    y = file[target_col]

    # Identifier-like columns are excluded from the features when they exist.
    id_cols = [c for c in ['id', 'time', 'index'] if c in file.columns]
    x = file.drop(columns=[target_col] + id_cols, errors='ignore')

    numeric_features = x.select_dtypes(include=['number']).columns.tolist()
    # Boolean columns are deliberately routed through the categorical branch.
    categorical_features = x.select_dtypes(
        include=['object', 'category', 'bool']
    ).columns.tolist()

    # Numeric columns: fill missing values with the column median.
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
    ])

    # Categorical columns: fill with the most frequent value, then one-hot
    # encode. Categories unseen during fit are ignored at transform time.
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    print(f"Num:{len(numeric_features)}")

    # Columns that are neither numeric nor categorical are dropped.
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ], remainder='drop')

    return x, y, preprocessor
|
|
|
|
|
def Training(x_train_clean, y_train, x_val_clean, y_val):
    """Fit a binary LightGBM classifier with early stopping on validation AUC.

    Args:
        x_train_clean: Training feature matrix.
        y_train: Training labels.
        x_val_clean: Validation feature matrix, used only for evaluation and
            early stopping.
        y_val: Validation labels.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    from lightgbm import LGBMClassifier, early_stopping

    # ``min_data_in_leaf`` is an alias of ``min_child_samples``; it was passed
    # with the same value, so only the canonical name is kept.
    model = LGBMClassifier(
        objective='binary',
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=31,
        min_child_samples=1,
        min_split_gain=0.0,
        random_state=42,
    )

    # Without an early-stopping callback all 2000 trees are always trained and
    # ``best_iteration_`` carries no information. ``first_metric_only=True``
    # is meant to stop on the requested AUC rather than the objective's
    # default metric. NOTE(review): confirm that 'auc' is ordered first.
    model.fit(
        x_train_clean, y_train,
        eval_set=[(x_val_clean, y_val)],
        eval_metric='auc',
        callbacks=[early_stopping(stopping_rounds=100, first_metric_only=True)],
    )

    print("Best iteration:", model.best_iteration_)

    return model
|
|
|
|
|
|
|
|
|
|
|
from sklearn.model_selection import train_test_split

# Mount Google Drive only when the mount point is not there yet.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

labels = pandas.read_csv('/content/drive/MyDrive/AI_assets/labels.csv')
light_curves = pandas.read_csv('/content/drive/MyDrive/AI_assets/light_curves.csv')
metadata = pandas.read_csv('/content/drive/MyDrive/AI_assets/metadata.csv')

data = pandas.read_csv('/content/drive/MyDrive/data.csv')

# Binary target: 1 for rows whose disposition is exactly "CONFIRMED", 0 for
# everything else (including missing values).
data['target'] = (data['koi_disposition'] == "CONFIRMED").astype('int64')

# The target is derived from `koi_disposition`, so that column must not reach
# the feature matrix: left in place it is one-hot encoded and leaks the label.
# NOTE(review): other columns in data.csv may also encode the disposition;
# verify against the dataset schema.
x, y, preprocessor = PipelineCreation(
    data.drop(columns=['koi_disposition']), target_col='target'
)
print("First step done. 1 -> PIPELINE CREATION")

# Stratified 80/20 split keeps the class ratio equal in both partitions.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.20, stratify=y, random_state=42
)

# The preprocessor is fitted on the training partition only and then applied
# to both partitions.
preprocessor.fit(x_train)
x_train_tr = preprocessor.transform(x_train)
x_val_tr = preprocessor.transform(x_val)

EDA(labels)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np
import pandas as pd

# Wrap the transformed matrices in DataFrames (default integer column labels)
# so columns can be selected by label.
x_train_df = pd.DataFrame(x_train_tr)
x_val_df = pd.DataFrame(x_val_tr)

# Per-column variance of the training data, computed once and reused below.
variancias = x_train_df.var()

# Keep only columns that vary in the training data; the same column selection
# is applied to the validation data.
valid_cols = x_train_df.columns[variancias > 0]
x_train_clean = x_train_df[valid_cols]
x_val_clean = x_val_df[valid_cols]

# Number of training columns with exactly zero variance.
sem_variancia = (variancias == 0).sum()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from lightgbm import LGBMClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Drop near-constant features. The threshold is learned from the training
# data only and then applied unchanged to the validation data.
var_sel = VarianceThreshold(threshold=1e-4)
var_sel.fit(x_train_clean)
x_train_filtered = var_sel.transform(x_train_clean)
x_val_filtered = var_sel.transform(x_val_clean)

# Standardise features using statistics fitted on the training data only.
# NOTE(review): the classifier fitted later in this script is trained on
# `x_train_clean`, not on these scaled arrays - confirm whether that is
# intended.
scaler = StandardScaler()
scaler.fit(x_train_filtered)
x_train_scaled = scaler.transform(x_train_filtered)
x_val_scaled = scaler.transform(x_val_filtered)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score


def _to_dense_float32(matrix):
    """Return ``matrix`` as a dense float32 ``numpy.ndarray`` copy."""
    # Objects exposing ``todense`` are densified first. Each matrix is checked
    # on its own instead of assuming validation matches the training type.
    if hasattr(matrix, "todense"):
        matrix = matrix.todense()
    return np.array(matrix, dtype=np.float32)


x_train_clean = _to_dense_float32(x_train_clean)
x_val_clean = _to_dense_float32(x_val_clean)

# Sanity checks on what is about to be fed to the model.
print("Treino - tipo:", type(x_train_clean))
print("Treino - shape:", x_train_clean.shape)
print("Treino - dtype:", x_train_clean.dtype)
print("Valores únicos de y:", np.unique(y_train, return_counts=True))

# NOTE(review): this fits all 2000 trees without a validation set and does not
# use the `Training` helper defined earlier in the file - confirm intent.
model = LGBMClassifier(
    objective='binary',
    learning_rate=0.01,
    n_estimators=2000,
    random_state=42,
)
model.fit(x_train_clean, y_train)

# Mean accuracy on the training data itself.
train_score = model.score(x_train_clean, y_train)
print("Treino score:", train_score)

# Accuracy on the held-out validation data.
y_pred = model.predict(x_val_clean)
acc = accuracy_score(y_val, y_pred)
print(f"Acurácia: {acc:.4f}")
|
|
|
|
|
|
|
|
|
|
|
|