import os

# Optional: suppress TensorFlow's native INFO and WARNING log messages.
# This has to be set BEFORE TensorFlow is imported: the native runtime emits
# messages while it loads, and setting the variable afterwards is too late to
# silence those.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import matplotlib.pyplot as plt  # noqa: E402
from sklearn.model_selection import train_test_split  # noqa: E402
import tensorflow as tf  # noqa: E402
from tensorflow.keras.preprocessing.text import Tokenizer  # noqa: E402
from tensorflow.keras.preprocessing.sequence import pad_sequences  # noqa: E402
from tensorflow.keras.models import Sequential  # noqa: E402
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense  # noqa: E402

# Step 1: Load CSV
# Reads 'data.csv' from the current working directory; the steps below use
# its 'text' and 'label' columns.
df = pd.read_csv('data.csv')

# Step 2: Clean Data
# Drop only the rows that lack a value in a column we actually use. A blanket
# dropna() would also discard rows whose sole gap is in some unrelated column.
df.dropna(subset=['text', 'label'], inplace=True)

# Step 3: Split Data
# Force texts to str so numeric-looking entries still tokenize as text.
texts = df['text'].astype(str).tolist()
labels = df['label'].tolist()

# Hold out 20% for testing; the fixed seed keeps the split reproducible.
x_train_texts, x_test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Step 4: Tokenization and Padding
vocab_size = 1000  # passed to the tokenizer as num_words and to Embedding as input_dim
maxlen = 50        # every sequence is padded/truncated to this many tokens

# The vocabulary is learned from the training texts only, so the test split
# does not influence it.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(x_train_texts)

# Encode both splits as integer sequences (train first, then test).
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (x_train_texts, x_test_texts)
)

# Bring every sequence to exactly `maxlen` tokens, padding and truncating at
# the end of the sequence.
x_train_pad, x_test_pad = (
    pad_sequences(split_seq, maxlen=maxlen, padding='post', truncating='post')
    for split_seq in (x_train_seq, x_test_seq)
)

# Convert the label lists to NumPy arrays for Keras.
y_train, y_test = np.array(y_train), np.array(y_test)

# Step 5: Build the Model
# Embed each token, average the embeddings over the sequence, then classify
# through a small dense head.
# NOTE(review): the Embedding layer is created without mask_zero, so padded
# positions appear to be included in the average -- confirm this is intended.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: binary classification

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked
# alongside the loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 6: Train the Model
# 20% of the training data is set aside by Keras for per-epoch validation.
fit_options = {
    'epochs': 10,
    'batch_size': 4,
    'validation_split': 0.2,
}
history = model.fit(x_train_pad, y_train, **fit_options)

# Step 7: Evaluate the Model
# evaluate() yields the loss followed by the compiled metrics (accuracy here).
test_results = model.evaluate(x_test_pad, y_test)
loss, acc = test_results
print(f"Test Accuracy: {acc:.2f}")

# Step 8: Visualize Accuracy
# Plot the per-epoch training and validation accuracy recorded by fit().
for metric_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Step 9: Predict on New Samples
sample_texts = ["I really enjoyed the movie!", "It was terrible and boring."]
sample_seq = tokenizer.texts_to_sequences(sample_texts)
# Preprocess exactly like the training data: pad AND truncate at the end.
# Without truncating='post', pad_sequences truncates from the front, so a
# sample longer than `maxlen` would be cut differently from the training data.
sample_pad = pad_sequences(sample_seq, maxlen=maxlen, padding='post', truncating='post')

# One row per sample; each row holds the single sigmoid output.
predictions = model.predict(sample_pad)

for text, prediction in zip(sample_texts, predictions):
    # Pull out the scalar probability so the threshold test compares a number
    # rather than a one-element array.
    score = float(prediction[0])
    sentiment = 'Positive' if score > 0.5 else 'Negative'
    print(f"Text: {text} => Prediction: {sentiment} ({score:.2f})")