Skip to content

Instantly share code, notes, and snippets.

@AhmedCoolProjects
Created November 23, 2025 02:37
Show Gist options
  • Select an option

  • Save AhmedCoolProjects/7253fafbc96315bcbbb07366e614f689 to your computer and use it in GitHub Desktop.

Select an option

Save AhmedCoolProjects/7253fafbc96315bcbbb07366e614f689 to your computer and use it in GitHub Desktop.
Bi-LSTM Application.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPU/y966iE2TqAf9sRVlmPX",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/AhmedCoolProjects/7253fafbc96315bcbbb07366e614f689/bi-lstm-application.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "BInhqD3gb85U"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"source": [
"class LSTM:\n",
"    '''\n",
"    Character-level LSTM with a softmax output layer, written in NumPy.\n",
"\n",
"    Every gate owns one weight matrix of shape (H, H + V) that multiplies the\n",
"    stacked column [h_(t-1); x_t], where x_t is a one-hot (V x 1) vector.\n",
"    '''\n",
"\n",
"    def __init__(self, hidden_size, vocab_size, learning_rate=0.01):\n",
"        '''\n",
"        hidden_size: number of hidden units (H)\n",
"        vocab_size: number of distinct symbols (V)\n",
"        learning_rate: default step size used by update_parameters\n",
"        '''\n",
"        self.H = hidden_size\n",
"        self.V = vocab_size\n",
"        self.lr = learning_rate\n",
"\n",
"        # Initialize weights (4 sets)\n",
"        # f: Forget, i: Input, c: Candidate, o: Output\n",
"        self._W_size = (self.H, self.H + self.V)  # weights shape for all gates\n",
"        # Forget Gate\n",
"        self.Wf = np.random.randn(*self._W_size) * 0.01\n",
"        self.bf = np.zeros((self.H, 1))\n",
"\n",
"        # Input Gate\n",
"        self.Wi = np.random.randn(*self._W_size) * 0.01\n",
"        self.bi = np.zeros((self.H, 1))\n",
"\n",
"        # Candidate Layer\n",
"        self.Wc = np.random.randn(*self._W_size) * 0.01\n",
"        self.bc = np.zeros((self.H, 1))\n",
"\n",
"        # Output Gate\n",
"        self.Wo = np.random.randn(*self._W_size) * 0.01\n",
"        self.bo = np.zeros((self.H, 1))\n",
"\n",
"        # Final Output Layer\n",
"        self.Why = np.random.randn(self.V, self.H) * 0.01\n",
"        self.by = np.zeros((self.V, 1))\n",
"\n",
"    def _sigmoid(self, z):\n",
"        '''Element-wise logistic function.'''\n",
"        return 1 / (1 + np.exp(-z))\n",
"\n",
"    def _tanh(self, z):\n",
"        '''Element-wise hyperbolic tangent.'''\n",
"        return np.tanh(z)\n",
"\n",
"    def _softmax(self, z):\n",
"        '''Softmax over axis 0; the max is subtracted before exponentiating.'''\n",
"        e_z = np.exp(z - np.max(z))\n",
"        return e_z / np.sum(e_z, axis=0)\n",
"\n",
"    def _sigmoid_derivative(self, z):\n",
"        '''Sigmoid derivative; z must already be the sigmoid OUTPUT.'''\n",
"        return z * (1 - z)\n",
"\n",
"    def _tanh_derivative(self, z):\n",
"        '''tanh derivative; z is the PRE-activation (tanh is applied here).'''\n",
"        return 1 - np.tanh(z) ** 2\n",
"\n",
"    def forward(self, inputs, h_prev, c_prev):\n",
"        '''\n",
"        Run the LSTM over a whole sequence.\n",
"\n",
"        inputs: list of input indices\n",
"        h_prev: previous hidden state (H x 1)\n",
"        c_prev: previous cell state (H x 1)\n",
"\n",
"        Returns (ys, h_last, c_last, cache): ys maps each time step to its\n",
"        softmax output (V x 1), h_last / c_last are the states after the last\n",
"        step, and cache holds every intermediate needed by backpropagation.\n",
"        '''\n",
"        xs, hs, cs, zs, ys = {}, {}, {}, {}, {}  # store values for each time step\n",
"        fs, is_, cs_tilde, os = {}, {}, {}, {}  # gate activations\n",
"        concat_inputs = {}\n",
"\n",
"        # Key -1 holds the initial state so that index t-1 is valid at t = 0\n",
"        hs[-1] = np.copy(h_prev)\n",
"        cs[-1] = np.copy(c_prev)\n",
"\n",
"        for t in range(len(inputs)):\n",
"            # 1. One-hot encode the input character\n",
"            xs[t] = np.zeros((self.V, 1))\n",
"            xs[t][inputs[t]] = 1\n",
"\n",
"            # 2. Concatenate h_prev and x_t\n",
"            concat_inputs[t] = np.vstack((hs[t-1], xs[t]))  # (H + V) x 1\n",
"\n",
"            # 3. Forget Gate\n",
"            fs[t] = self._sigmoid(np.dot(self.Wf, concat_inputs[t]) + self.bf)\n",
"\n",
"            # 4. Input Gate\n",
"            is_[t] = self._sigmoid(np.dot(self.Wi, concat_inputs[t]) + self.bi)\n",
"\n",
"            # 5. Candidate Layer\n",
"            cs_tilde[t] = self._tanh(np.dot(self.Wc, concat_inputs[t]) + self.bc)\n",
"\n",
"            # 6. Output Gate\n",
"            os[t] = self._sigmoid(np.dot(self.Wo, concat_inputs[t]) + self.bo)\n",
"\n",
"            # 7. Update Cell State\n",
"            cs[t] = fs[t] * cs[t-1] + is_[t] * cs_tilde[t]  # (H x 1)\n",
"\n",
"            # 8. Compute Hidden State\n",
"            hs[t] = os[t] * self._tanh(cs[t])  # (H x 1)\n",
"\n",
"            # 9. Compute Output\n",
"            zs[t] = np.dot(self.Why, hs[t]) + self.by\n",
"            ys[t] = self._softmax(zs[t])  # (V x 1)\n",
"\n",
"        cache = (xs, hs, cs, fs, is_, cs_tilde, os, zs, ys, concat_inputs)\n",
"\n",
"        return ys, hs[len(inputs)-1], cs[len(inputs)-1], cache\n",
"\n",
"    def compute_cost(self, y_preds, targets):\n",
"        '''Mean cross-entropy of the target indices under y_preds.'''\n",
"        total_cost = 0\n",
"        for t in range(len(targets)):\n",
"            # [target_index, 0] picks the scalar probability out of the (V x 1) column\n",
"            prob_of_target = y_preds[t][targets[t], 0]\n",
"            total_cost += -np.log(prob_of_target + 1e-9)  # add small value to avoid log(0)\n",
"        return total_cost / len(targets)\n",
"\n",
"    def backpropagation(self, targets, cache):\n",
"        '''\n",
"        Backpropagation through time for the softmax cross-entropy loss.\n",
"\n",
"        targets: list of target indices, one per time step\n",
"        cache: the cache tuple returned by forward\n",
"\n",
"        Gradients are summed over time steps, stored on self as dW* / db*,\n",
"        and clipped element-wise to [-5, 5].\n",
"        '''\n",
"        xs, hs, cs, fs, is_, cs_tilde, os, zs, ys, concat_inputs = cache\n",
"        # Initialize gradients\n",
"        self.dWf = np.zeros_like(self.Wf)\n",
"        self.dbf = np.zeros_like(self.bf)\n",
"        self.dWi = np.zeros_like(self.Wi)\n",
"        self.dbi = np.zeros_like(self.bi)\n",
"        self.dWc = np.zeros_like(self.Wc)\n",
"        self.dbc = np.zeros_like(self.bc)\n",
"        self.dWo = np.zeros_like(self.Wo)\n",
"        self.dbo = np.zeros_like(self.bo)\n",
"        self.dWhy = np.zeros_like(self.Why)\n",
"        self.dby = np.zeros_like(self.by)\n",
"\n",
"        # Sized from H directly so an empty sequence does not need hs[0]\n",
"        dh_next = np.zeros((self.H, 1))\n",
"        dc_next = np.zeros((self.H, 1))\n",
"\n",
"        for t in reversed(range(len(targets))):\n",
"            tanh_c = self._tanh(cs[t])\n",
"\n",
"            # 1. Output layer\n",
"            dy = np.copy(ys[t])\n",
"            dy[targets[t]] -= 1  # y_pred - y_true\n",
"            self.dWhy += np.dot(dy, hs[t].T)\n",
"            self.dby += dy\n",
"\n",
"            # 2. Gradient for hidden state dh_t\n",
"            dh = np.dot(self.Why.T, dy) + dh_next\n",
"\n",
"            # 3. Gradient for output gate\n",
"            do = dh * tanh_c * os[t] * (1 - os[t])\n",
"            self.dWo += np.dot(do, concat_inputs[t].T)\n",
"            self.dbo += do\n",
"\n",
"            # 4. Gradient for cell state\n",
"            dc = dh * os[t] * (1 - tanh_c**2) + dc_next\n",
"\n",
"            # 5. Gradient for forget gate\n",
"            df = dc * cs[t-1] * fs[t] * (1 - fs[t])\n",
"            self.dWf += np.dot(df, concat_inputs[t].T)\n",
"            self.dbf += df\n",
"\n",
"            # 6. Gradient for input gate\n",
"            di = dc * cs_tilde[t] * is_[t] * (1 - is_[t])\n",
"            self.dWi += np.dot(di, concat_inputs[t].T)\n",
"            self.dbi += di\n",
"\n",
"            # 7. Gradient for candidate layer\n",
"            dc_tilde = dc * is_[t] * (1 - cs_tilde[t]**2)\n",
"            self.dWc += np.dot(dc_tilde, concat_inputs[t].T)\n",
"            self.dbc += dc_tilde\n",
"\n",
"            # 8. Gradient for concatenated input\n",
"            dconcat = (np.dot(self.Wf.T, df) +\n",
"                       np.dot(self.Wi.T, di) +\n",
"                       np.dot(self.Wc.T, dc_tilde) +\n",
"                       np.dot(self.Wo.T, do))\n",
"            dh_next = dconcat[:self.H, :]  # Gradient for h_(t-1)\n",
"            dc_next = dc * fs[t]  # Gradient for C_(t-1)\n",
"\n",
"        # Gradient clipping to prevent exploding gradients\n",
"        for grad in [self.dWf, self.dbf, self.dWi, self.dbi,\n",
"                     self.dWc, self.dbc, self.dWo, self.dbo,\n",
"                     self.dWhy, self.dby]:\n",
"            np.clip(grad, -5, 5, out=grad)\n",
"\n",
"    def update_parameters(self, learning_rate=None):\n",
"        '''\n",
"        Apply one gradient-descent step using the stored gradients.\n",
"\n",
"        learning_rate: step size; when None, the learning rate given to the\n",
"        constructor (self.lr) is used.\n",
"        '''\n",
"        if learning_rate is None:\n",
"            learning_rate = self.lr\n",
"\n",
"        self.Wf -= learning_rate * self.dWf\n",
"        self.bf -= learning_rate * self.dbf\n",
"        self.Wi -= learning_rate * self.dWi\n",
"        self.bi -= learning_rate * self.dbi\n",
"        self.Wc -= learning_rate * self.dWc\n",
"        self.bc -= learning_rate * self.dbc\n",
"        self.Wo -= learning_rate * self.dWo\n",
"        self.bo -= learning_rate * self.dbo\n",
"        self.Why -= learning_rate * self.dWhy\n",
"        self.by -= learning_rate * self.dby\n",
"\n",
"    def sample(self, seed_idx, h_prev, c_prev, length=20):\n",
"        '''\n",
"        Generate `length` indices, feeding each sampled index back as input.\n",
"\n",
"        seed_idx: index of the first input symbol\n",
"        h_prev, c_prev: initial hidden / cell state (H x 1)\n",
"\n",
"        Returns the list of sampled indices (the seed itself is not included).\n",
"        '''\n",
"        x = np.zeros((self.V, 1))\n",
"        x[seed_idx] = 1\n",
"        indices = []\n",
"\n",
"        for t in range(length):\n",
"            concat_input = np.vstack((h_prev, x))\n",
"\n",
"            # Gates\n",
"            f = self._sigmoid(np.dot(self.Wf, concat_input) + self.bf)\n",
"            i = self._sigmoid(np.dot(self.Wi, concat_input) + self.bi)\n",
"            c_tilde = self._tanh(np.dot(self.Wc, concat_input) + self.bc)\n",
"            o = self._sigmoid(np.dot(self.Wo, concat_input) + self.bo)\n",
"\n",
"            # Update cell state\n",
"            c = f * c_prev + i * c_tilde\n",
"\n",
"            # Compute hidden state\n",
"            h = o * self._tanh(c)\n",
"\n",
"            z = np.dot(self.Why, h) + self.by\n",
"            y = self._softmax(z)\n",
"\n",
"            # Sample the next index according to the probabilities in y.\n",
"            # .ravel() turns the (V, 1) column into the (V,) vector that\n",
"            # np.random.choice expects for p.\n",
"            idx = np.random.choice(range(self.V), p=y.ravel())\n",
"            x = np.zeros((self.V, 1))\n",
"            x[idx] = 1\n",
"            indices.append(idx)\n",
"            h_prev = h\n",
"            c_prev = c\n",
"\n",
"        return indices"
],
"metadata": {
"id": "O-l2bzQRcBmj"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"class BiLSTM:\n",
"    '''\n",
"    Bidirectional character-level LSTM assembled from two LSTM instances.\n",
"\n",
"    The forward LSTM reads the sequence left to right, the backward LSTM reads\n",
"    it right to left, and one shared softmax layer maps the stacked hidden\n",
"    states [h_fwd; h_bwd] (2H x 1) to a distribution over the V symbols.\n",
"    '''\n",
"\n",
"    def __init__(self, hidden_size, vocab_size, learning_rate=0.01):\n",
"        '''\n",
"        hidden_size: hidden units per direction (H)\n",
"        vocab_size: number of distinct symbols (V)\n",
"        learning_rate: step size used by update_parameters\n",
"        '''\n",
"        self.H = hidden_size\n",
"        self.V = vocab_size\n",
"        self.lr = learning_rate\n",
"\n",
"        # Two LSTM instances (only their gate weights are used here; their own\n",
"        # Why / by output layers are never read by this class)\n",
"        self.f_lstm = LSTM(self.H, self.V, self.lr)  # Forward LSTM\n",
"        self.b_lstm = LSTM(self.H, self.V, self.lr)  # Backward LSTM\n",
"\n",
"        # Combined output layer\n",
"        # The input to this layer is (H_fwd + H_bwd) = 2H\n",
"        self.Wy = np.random.randn(self.V, 2 * self.H) * 0.01\n",
"        self.by = np.zeros((self.V, 1))\n",
"\n",
"        # Gradients for output layer\n",
"        self.dWy = np.zeros_like(self.Wy)\n",
"        self.dby = np.zeros_like(self.by)\n",
"\n",
"    def _softmax(self, z):\n",
"        '''Column-wise softmax; the column max is subtracted before exp.'''\n",
"        exp_z = np.exp(z - np.max(z, axis=0))\n",
"        return exp_z / exp_z.sum(axis=0)\n",
"\n",
"    def forward(self, inputs, h_prev_f, c_prev_f, h_prev_b, c_prev_b):\n",
"        '''\n",
"        Run both directions over `inputs` and combine them per time step.\n",
"\n",
"        inputs: list of input indices\n",
"        h_prev_f, c_prev_f: initial states of the forward LSTM (H x 1)\n",
"        h_prev_b, c_prev_b: initial states of the backward LSTM (H x 1)\n",
"\n",
"        Returns (ys, cache): ys maps each time step to its softmax output\n",
"        (V x 1); cache is the tuple expected by backpropagation.\n",
"        '''\n",
"        # 1. Run forward LSTM\n",
"        _, _, _, cache_f = self.f_lstm.forward(inputs, h_prev_f, c_prev_f)\n",
"        hs_f = cache_f[1]\n",
"\n",
"        # 2. Run backward LSTM on the reversed sequence\n",
"        rev_inputs = inputs[::-1]\n",
"        _, _, _, cache_b = self.b_lstm.forward(rev_inputs, h_prev_b, c_prev_b)\n",
"        rev_hs_b = cache_b[1]\n",
"\n",
"        # 3. Combine outputs\n",
"        ys, concat_hs = {}, {}\n",
"\n",
"        # We need to match time steps.\n",
"        # hs_f[0] corresponds to inputs[0]\n",
"        # rev_hs_b[0] corresponds to inputs[T-1] (last time step)\n",
"        T = len(inputs)\n",
"        for t in range(T):\n",
"            h_f = hs_f[t]  # forward hidden state for input t\n",
"            h_b = rev_hs_b[T - 1 - t]  # backward hidden state for input t\n",
"\n",
"            concat_hs[t] = np.vstack((h_f, h_b))  # Concatenate\n",
"            z = np.dot(self.Wy, concat_hs[t]) + self.by\n",
"            ys[t] = self._softmax(z)\n",
"\n",
"        cache = (cache_f, cache_b, concat_hs, ys)\n",
"        return ys, cache\n",
"\n",
"    def compute_cost(self, ys, targets):\n",
"        '''Mean cross-entropy of the target indices under ys.'''\n",
"        total_cost = 0\n",
"        for t in range(len(targets)):\n",
"            prob = ys[t][targets[t], 0]\n",
"            total_cost += -np.log(prob + 1e-9)  # small value avoids log(0)\n",
"        return total_cost / len(targets)\n",
"\n",
"    def backpropagation(self, targets, cache):\n",
"        '''\n",
"        Compute gradients for the output layer and both inner LSTMs.\n",
"\n",
"        targets: list of target indices, one per time step\n",
"        cache: the cache tuple returned by forward\n",
"\n",
"        Results are stored in self.dWy / self.dby and on each inner LSTM as\n",
"        dW* / db*; every gradient is clipped element-wise to [-5, 5].\n",
"        '''\n",
"        cache_f, cache_b, concat_hs, ys = cache\n",
"\n",
"        T = len(targets)\n",
"\n",
"        # Start from zero on every call; otherwise the output-layer gradients\n",
"        # of earlier training steps would keep accumulating.\n",
"        self.dWy = np.zeros_like(self.Wy)\n",
"        self.dby = np.zeros_like(self.by)\n",
"\n",
"        # We need to collect gradients for the hidden states of both LSTMs\n",
"        # to run their specific backprop logic.\n",
"        dh_f_seq = {}\n",
"        dh_b_seq = {}\n",
"\n",
"        # 1. Backprop through the Combined output layer\n",
"        for t in reversed(range(T)):\n",
"            dy = np.copy(ys[t])\n",
"            dy[targets[t]] -= 1  # y_pred - y_true\n",
"\n",
"            self.dWy += np.dot(dy, concat_hs[t].T)\n",
"            self.dby += dy\n",
"\n",
"            # Calculate gradient for the concatenated hidden state\n",
"            # dh_concat = Wy.T @ dy\n",
"            dh_concat = np.dot(self.Wy.T, dy)\n",
"\n",
"            # Split gradients back to forward and backward LSTM.\n",
"            # The backward LSTM ran on the reversed sequence, so its step\n",
"            # index for input t is T - 1 - t.\n",
"            dh_f_seq[t] = dh_concat[:self.H, :]\n",
"            dh_b_seq[T - 1 - t] = dh_concat[self.H:, :]\n",
"\n",
"        # Clip like the inner LSTM gradients\n",
"        for grad in [self.dWy, self.dby]:\n",
"            np.clip(grad, -5, 5, out=grad)\n",
"\n",
"        # 2. Backprop through Forward LSTM\n",
"        self._lstm_backprop_manual(self.f_lstm, dh_f_seq, cache_f, T)\n",
"        # 3. Backprop through Backward LSTM\n",
"        self._lstm_backprop_manual(self.b_lstm, dh_b_seq, cache_b, T)\n",
"\n",
"    def _lstm_backprop_manual(self, lstm, dh_seq, cache, T):\n",
"        '''\n",
"        Runs BPTT for a single LSTM given a sequence of hidden states gradients.\n",
"\n",
"        lstm: the LSTM whose dW* / db* attributes are (re)written\n",
"        dh_seq: dict step -> gradient w.r.t. that step's hidden state (H x 1)\n",
"        cache: the cache tuple produced by lstm.forward\n",
"        T: number of time steps\n",
"        '''\n",
"        # The cache elements are:\n",
"        # xs, hs, cs, fs_gate, is_gate, cs_tilde, os_gate, zs, ys, concat_inputs\n",
"        (_, hs_cache, cs_cache, fs_gate_cache, is_gate_cache,\n",
"         cs_tilde_cache, os_gate_cache, _, _, concat_inputs_cache) = cache\n",
"\n",
"        # Initialize LSTM gradients\n",
"        lstm.dWf = np.zeros_like(lstm.Wf)\n",
"        lstm.dWi = np.zeros_like(lstm.Wi)\n",
"        lstm.dWc = np.zeros_like(lstm.Wc)\n",
"        lstm.dWo = np.zeros_like(lstm.Wo)\n",
"        lstm.dbf = np.zeros_like(lstm.bf)\n",
"        lstm.dbi = np.zeros_like(lstm.bi)\n",
"        lstm.dbc = np.zeros_like(lstm.bc)\n",
"        lstm.dbo = np.zeros_like(lstm.bo)\n",
"\n",
"        # Key -1 (the initial state) always exists, even for an empty sequence\n",
"        dh_next = np.zeros_like(hs_cache[-1])\n",
"        dc_next = np.zeros_like(cs_cache[-1])\n",
"\n",
"        for t in reversed(range(T)):\n",
"            # Total gradient for h_t: from the combined layer above plus from step t+1\n",
"            dh = dh_seq[t] + dh_next\n",
"\n",
"            # ---- Standard BPTT ----\n",
"            # Gradient for output gate (do)\n",
"            do = dh * lstm._tanh(cs_cache[t])\n",
"            do_raw = do * lstm._sigmoid_derivative(os_gate_cache[t])\n",
"            lstm.dWo += np.dot(do_raw, concat_inputs_cache[t].T)\n",
"            lstm.dbo += do_raw\n",
"\n",
"            # Gradient for cell state (dc); cs_cache[t] is the value fed INTO\n",
"            # tanh, which is what _tanh_derivative expects\n",
"            dc = dh * os_gate_cache[t] * lstm._tanh_derivative(cs_cache[t]) + dc_next\n",
"\n",
"            # Gradient for candidate cell state (dc_cand); cs_tilde is already\n",
"            # a tanh OUTPUT, so its derivative is 1 - cs_tilde**2\n",
"            dc_cand = dc * is_gate_cache[t]\n",
"            dc_cand_raw = dc_cand * (1 - cs_tilde_cache[t] ** 2)\n",
"            lstm.dWc += np.dot(dc_cand_raw, concat_inputs_cache[t].T)\n",
"            lstm.dbc += dc_cand_raw\n",
"\n",
"            # Gradient for input gate (di)\n",
"            di = dc * cs_tilde_cache[t]\n",
"            di_raw = di * lstm._sigmoid_derivative(is_gate_cache[t])\n",
"            lstm.dWi += np.dot(di_raw, concat_inputs_cache[t].T)\n",
"            lstm.dbi += di_raw\n",
"\n",
"            # Gradient for forget gate (df)\n",
"            df = dc * cs_cache[t-1]\n",
"            df_raw = df * lstm._sigmoid_derivative(fs_gate_cache[t])\n",
"            lstm.dWf += np.dot(df_raw, concat_inputs_cache[t].T)\n",
"            lstm.dbf += df_raw\n",
"\n",
"            # Gradient for concatenated input\n",
"            dconcat = (np.dot(lstm.Wf.T, df_raw) +\n",
"                       np.dot(lstm.Wi.T, di_raw) +\n",
"                       np.dot(lstm.Wc.T, dc_cand_raw) +\n",
"                       np.dot(lstm.Wo.T, do_raw))\n",
"\n",
"            dh_next = dconcat[:lstm.H, :]  # Gradient for h_(t-1)\n",
"            dc_next = dc * fs_gate_cache[t]  # Gradient for C_(t-1)\n",
"\n",
"        # Clip gradients\n",
"        for grad in [lstm.dWf, lstm.dWi, lstm.dWc, lstm.dWo,\n",
"                     lstm.dbf, lstm.dbi, lstm.dbc, lstm.dbo]:\n",
"            np.clip(grad, -5, 5, out=grad)\n",
"\n",
"    def update_parameters(self):\n",
"        '''Apply one gradient-descent step (step size self.lr) to all weights.'''\n",
"        # Combined Layer\n",
"        self.Wy -= self.lr * self.dWy\n",
"        self.by -= self.lr * self.dby\n",
"\n",
"        # Inner LSTMs\n",
"        self._lstm_update_manual(self.f_lstm)\n",
"        self._lstm_update_manual(self.b_lstm)\n",
"\n",
"    def _lstm_update_manual(self, lstm):\n",
"        '''Gradient-descent step on the gate weights and biases of `lstm`.'''\n",
"        lstm.Wf -= self.lr * lstm.dWf\n",
"        lstm.Wi -= self.lr * lstm.dWi\n",
"        lstm.Wc -= self.lr * lstm.dWc\n",
"        lstm.Wo -= self.lr * lstm.dWo\n",
"        lstm.bf -= self.lr * lstm.dbf\n",
"        lstm.bi -= self.lr * lstm.dbi\n",
"        lstm.bc -= self.lr * lstm.dbc\n",
"        lstm.bo -= self.lr * lstm.dbo\n",
"\n",
"    def sample(self, seed_idx, h_prev_fwd, c_prev_fwd, length=20):\n",
"        '''\n",
"        Generates text using ONLY the Forward LSTM.\n",
"        Ideally, Bi-LSTMs are not used for generation, but this is a demonstration.\n",
"        We feed zeros for the backward state, so the output layer sees an input\n",
"        it was not trained on and the samples can be poor.\n",
"\n",
"        seed_idx: index of the first input symbol\n",
"        h_prev_fwd, c_prev_fwd: initial forward hidden / cell state (H x 1)\n",
"        length: number of indices to generate\n",
"\n",
"        Returns the list of sampled indices (the seed itself is not included).\n",
"        '''\n",
"        x = np.zeros((self.V, 1))\n",
"        x[seed_idx] = 1\n",
"\n",
"        generated_indices = []\n",
"\n",
"        # Placeholder for backward state (zeros)\n",
"        h_fake_bwd = np.zeros((self.H, 1))\n",
"\n",
"        lstm = self.f_lstm\n",
"        for t in range(length):\n",
"            # --- 1. Run One Step of Forward LSTM ---\n",
"            # (The step is written out here because LSTM.forward loops over a\n",
"            # whole sequence.)\n",
"\n",
"            # Concatenate input [h_prev, x]\n",
"            concat_input = np.vstack((h_prev_fwd, x))\n",
"\n",
"            # Gates\n",
"            f = lstm._sigmoid(np.dot(lstm.Wf, concat_input) + lstm.bf)\n",
"            i = lstm._sigmoid(np.dot(lstm.Wi, concat_input) + lstm.bi)\n",
"            c_tilde = lstm._tanh(np.dot(lstm.Wc, concat_input) + lstm.bc)\n",
"            o = lstm._sigmoid(np.dot(lstm.Wo, concat_input) + lstm.bo)\n",
"\n",
"            # New cell and hidden state of the forward LSTM\n",
"            c_prev_fwd = f * c_prev_fwd + i * c_tilde\n",
"            h_prev_fwd = o * lstm._tanh(c_prev_fwd)\n",
"\n",
"            # --- 2. Combined output layer with a zero backward state ---\n",
"            concat_h = np.vstack((h_prev_fwd, h_fake_bwd))\n",
"            y = self._softmax(np.dot(self.Wy, concat_h) + self.by)\n",
"\n",
"            # --- 3. Sample the next index and feed it back as input ---\n",
"            idx = np.random.choice(range(self.V), p=y.ravel())\n",
"            generated_indices.append(idx)\n",
"            x = np.zeros((self.V, 1))\n",
"            x[idx] = 1\n",
"\n",
"        return generated_indices"
],
"metadata": {
"id": "6_VfzGIscIpE"
},
"execution_count": 33,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 1. prepare data\n",
"data = \"helloahmed\"\n",
"# sorted() fixes the symbol order; iterating a bare set of strings can give a\n",
"# different order (and therefore different indices) on every kernel start\n",
"chars = sorted(set(data))\n",
"vocab_size = len(chars)\n",
"char_to_idx = { ch:i for i,ch in enumerate(chars)}\n",
"idx_to_char = { i:ch for i,ch in enumerate(chars)}\n",
"\n",
"print(f\"Data: {data}\")\n",
"print(f\"Vocabulary: {chars}\")\n",
"print(f\"Vocab Size: {vocab_size}\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aBw74HwQcNZO",
"outputId": "c7990f0a-a17a-45ef-aacd-e5992485d4d2"
},
"execution_count": 45,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Data: helloahmed\n",
"Vocabulary: ['a', 'd', 'e', 'h', 'l', 'm', 'o']\n",
"Vocab Size: 7\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# 2. Model and training configuration\n",
"hidden_size = 25\n",
"learning_rate = 0.01\n",
"epochs = 500 # in the recorded run the cost is already ~0.0003 by epoch 400\n",
"\n",
"# Seed NumPy so that weight initialisation and sampling are reproducible\n",
"random_seed = 42\n",
"np.random.seed(random_seed)"
],
"metadata": {
"id": "3yfCJF4kcQll"
},
"execution_count": 51,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Build the bidirectional model from the configuration above\n",
"bilstm = BiLSTM(\n",
"    hidden_size=hidden_size,\n",
"    vocab_size=vocab_size,\n",
"    learning_rate=learning_rate,\n",
")"
],
"metadata": {
"id": "fY1OZ3oRcS9L"
},
"execution_count": 52,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 3. Training Loop\n",
"print(\"Training Bi-LSTM...\")\n",
"costs = []\n",
"\n",
"# Next-character prediction: the targets are the inputs shifted by one position\n",
"inputs = [char_to_idx[ch] for ch in data[:-1]]\n",
"targets = [char_to_idx[ch] for ch in data[1:]]\n",
"\n",
"for epoch in range(epochs):\n",
"    # Initialize states for BOTH LSTMs\n",
"    h_prev_fwd = np.zeros((hidden_size, 1))\n",
"    c_prev_fwd = np.zeros((hidden_size, 1))\n",
"    h_prev_bwd = np.zeros((hidden_size, 1))\n",
"    c_prev_bwd = np.zeros((hidden_size, 1))\n",
"\n",
"    # Forward pass\n",
"    y_preds, cache = bilstm.forward(inputs, h_prev_fwd, c_prev_fwd, h_prev_bwd, c_prev_bwd)\n",
"\n",
"    # Compute cost\n",
"    cost = bilstm.compute_cost(y_preds, targets)\n",
"\n",
"    # Backpropagation\n",
"    bilstm.backpropagation(targets, cache)\n",
"\n",
"    # Update parameters\n",
"    bilstm.update_parameters()\n",
"\n",
"    # Log and record the cost once every 100 epochs\n",
"    if epoch % 100 == 0:\n",
"        print(f\"Epoch {epoch}, Cost: {cost:.4f}\")\n",
"        costs.append(cost)\n",
"\n",
"print(\"Training complete.\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "R7Fhomt3cVip",
"outputId": "1272c289-a8ea-4982-f96f-ba6cc749fbb5"
},
"execution_count": 53,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Training Bi-LSTM...\n",
"Epoch 0, Cost: 1.9458\n",
"Epoch 100, Cost: 1.8416\n",
"Epoch 200, Cost: 0.2916\n",
"Epoch 300, Cost: 0.0082\n",
"Epoch 400, Cost: 0.0003\n",
"Training complete.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Cost curve: one point per logged epoch (every 100th)\n",
"cost_curve = np.squeeze(costs)\n",
"fig, ax = plt.subplots()\n",
"ax.plot(cost_curve)\n",
"ax.set_ylabel('Cost')\n",
"ax.set_xlabel('Epochs (per 100)')\n",
"ax.set_title('Bi-LSTM Training')\n",
"plt.show()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 472
},
"id": "2uvoGm27cx3G",
"outputId": "0b1f4e38-2ae4-4878-a8c4-10add990b543"
},
"execution_count": 54,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
],
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Draw one sample sequence from the trained model\n",
"print(\"\\nSampling from the model:\")\n",
"\n",
"# The seed character is 'h'; look up its index\n",
"seed_char_idx = char_to_idx['h']\n",
"\n",
"# Sampling only needs the forward LSTM states, both starting at zero\n",
"state_shape = (hidden_size, 1)\n",
"h_sample_fwd, c_sample_fwd = np.zeros(state_shape), np.zeros(state_shape)\n",
"\n",
"generated_indices = bilstm.sample(\n",
"    seed_char_idx, h_sample_fwd, c_sample_fwd, length=10\n",
")\n",
"\n",
"# Map the sampled indices back to characters and prepend the seed\n",
"sampled_chars = [idx_to_char[idx] for idx in generated_indices]\n",
"generated_text = 'h' + ''.join(sampled_chars)\n",
"\n",
"print(f\"Generated text: '{generated_text}'\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "56gClwK1fF8a",
"outputId": "7646ba2f-bedd-45af-9197-accdcf8437c5"
},
"execution_count": 57,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Sampling from the model:\n",
"Generated text: 'heddddddddd'\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "sM4fD2LmePff"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment