-
-
Save arrufat/062b8847b7f87465efd96d627dadf1ad to your computer and use it in GitHub Desktop.
| #include <algorithm> | |
| #include <iostream> | |
| #include <dlib/data_io.h> | |
| #include <dlib/dnn.h> | |
| #include <dlib/gui_widgets.h> | |
| #include <dlib/matrix.h> | |
| using namespace std; | |
| using namespace dlib; | |
// Helper definitions for the noise generation: the generator's input is a
// bank of noise_size 1x1 float matrices, i.e. a 100-dimensional latent vector.
constexpr size_t noise_size = 100;
// constexpr size_t input_size = 28;
using noise_t = std::array<matrix<float, 1, 1>, noise_size>;
| noise_t make_noise() | |
| { | |
| noise_t noise; | |
| std::for_each(begin(noise), end(noise), [] (matrix<float, 1, 1> &m) { | |
| m = matrix_cast<float>(randm(1, 1)); | |
| }); | |
| return noise; | |
| } | |
// A custom convolution definition that allows the padding size to be
// specified explicitly (dlib's stock `con` alias fixes the padding).
template<long num_filters, long kernel_size, int stride, int padding, typename SUBNET>
using conp = add_layer<con_<num_filters, kernel_size, kernel_size, stride, stride, padding, padding>, SUBNET>;
// The matching transposed ("deconvolution") layer with custom padding,
// used by the generator to upsample spatially.
template<long num_filters, long kernel_size, int stride, int padding, typename SUBNET>
using contp = add_layer<cont_<num_filters, kernel_size, kernel_size, stride, stride, padding, padding>, SUBNET>;
// The generator is made of a bunch of transposed-convolutional layers. Its
// input is a 1 x 1 x k noise tensor, upsampled through bn_con+relu+contp
// stages into an image plane squashed by htan.
// NOTE(review): the fc<1>/loss_binary_log wrapper means this network type
// ends in a score, not an image — the generated image itself is read out of
// the htan layer (layer<2>) by generated_image() below. Confirm this is the
// intended training construction; it is the part under discussion.
template<typename SUBNET>
using generator_type =
    loss_binary_log<fc<1,
    htan<contp<1, 4, 2, 1,
    relu<bn_con<contp<64, 4, 2, 1,
    relu<bn_con<contp<128, 3, 2, 1,
    relu<bn_con<contp<256, 4, 1, 0,
    SUBNET>>>>>>>>>>>>>;
// input<std::array<matrix<double, 1, 1>, noise_size>>
// Now, let's proceed to define the discriminator, whose role will be to
// decide whether an image is real or fake. It is a stack of strided
// convolutions with prelu activations (slopes supplied at construction),
// ending in a single sigmoid score wrapped in a binary log loss.
template<typename SUBNET>
using discriminator_type =
    loss_binary_log<sig<
    bn_con<conp<1, 3, 1, 0,
    prelu<bn_con<conp<256, 4, 2, 1,
    prelu<bn_con<conp<128, 4, 2, 1,
    prelu<conp<64, 4, 2, 1,
    SUBNET>>>>>>>>>>>>;
| template<typename NET> | |
| matrix<unsigned char> generated_image(NET net) | |
| { | |
| matrix<float> output = image_plane(layer<2>(net).get_output()); | |
| matrix<unsigned char> image; | |
| assign_image_scaled(image, output); | |
| return image; | |
| } | |
| // // Now, let's define a way to easily get the generated image | |
| // matrix<unsigned char> get_image(dcgan_type net) | |
| // { | |
| // matrix<float> output = image_plane(layer<tag4>(net).get_output()); | |
| // matrix<unsigned char> result; | |
| // assign_image_scaled(result, output); | |
| // return result; | |
| // } | |
// Entry point: loads MNIST, builds the generator/discriminator pair, and
// runs an alternating real/fake training loop for the discriminator while
// attempting to push the discriminator's gradient back into the generator.
int main(int argc, char** argv) try
{
    // This example is going to run on the MNIST dataset.
    if (argc != 2)
    {
        cout << "This example needs the MNIST dataset to run!" << endl;
        cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl;
        cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl;
        cout << "put them in a folder. Then give that folder as input to this program." << endl;
        return EXIT_FAILURE;
    }
    // MNIST is broken into two parts, a training set of 60000 images and a test set of
    // 10000 images. Each image is labeled so that we know what hand written digit is
    // depicted. These next statements load the dataset into memory.
    std::vector<matrix<unsigned char>> training_images;
    std::vector<unsigned long> training_labels;
    std::vector<matrix<unsigned char>> testing_images;
    std::vector<unsigned long> testing_labels;
    load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels);
    // Instantiate both networks; the discriminator's three prelu layers get
    // an initial slope of 0.2.
    generator_type<input<noise_t>> generator;
    discriminator_type<input<matrix<unsigned char>>> discriminator(
        prelu_(0.2), prelu_(0.2), prelu_(0.2));
    cout << generator << endl;
    cout << discriminator << endl;
    // Both trainers use adam(weight_decay=0, beta1=0.5, beta2=0.999) with a
    // fixed learning rate of 2e-4, and checkpoint to disk every 5 minutes.
    dnn_trainer<decltype(generator), adam> gen_trainer(generator, adam(0, 0.5, 0.999));
    gen_trainer.set_synchronization_file("dcgan_generator_sync", std::chrono::minutes(5));
    gen_trainer.be_verbose();
    gen_trainer.set_learning_rate(2e-4);
    cout << gen_trainer << endl;
    dnn_trainer<decltype(discriminator), adam> dis_trainer(discriminator, adam(0, 0.5, 0.999));
    dis_trainer.set_synchronization_file("dcgan_discriminator_sync", std::chrono::minutes(5));
    dis_trainer.be_verbose();
    dis_trainer.set_learning_rate(2e-4);
    cout << dis_trainer << endl;
    const long minibatch_size = 256;
    dlib::rand rnd(time(0));
    // image_window win;
    // win.set_title("DCGAN example");
    while (gen_trainer.get_train_one_step_calls() < 1000)
    {
        // train the discriminator with real images
        std::vector<matrix<unsigned char>> mini_batch_real_samples;
        std::vector<float> mini_batch_real_labels;
        while (mini_batch_real_samples.size() < minibatch_size)
        {
            // Sample real MNIST images uniformly at random.
            auto idx = rnd.get_random_32bit_number() % training_images.size();
            mini_batch_real_samples.push_back(training_images[idx]);
            // One-sided label smoothing: real labels drawn from [0.8, 1.0]
            // instead of a hard 1.
            mini_batch_real_labels.push_back(rnd.get_double_in_range(0.8, 1.0));
        }
        dis_trainer.train_one_step(mini_batch_real_samples, mini_batch_real_labels);
        // NOTE(review): loss_real is captured here but never read again —
        // either dead code or a missing piece of the generator update.
        resizable_tensor loss_real = discriminator.subnet().get_gradient_input();
        // train the discriminator with fake images
        std::vector<matrix<unsigned char>> mini_batch_fake_samples;
        std::vector<float> mini_batch_fake_labels;
        // NOTE(review): noises is filled but never used afterwards.
        std::vector<noise_t> noises;
        while (mini_batch_fake_samples.size() < minibatch_size)
        {
            auto noise = make_noise();
            noises.push_back(noise);
            // Forward the noise through the generator, then read the image
            // back out of the htan layer via generated_image().
            generator(noise);
            matrix<unsigned char> fake_img = generated_image(generator);
            mini_batch_fake_samples.push_back(fake_img);
            // Fake samples get label -1 (loss_binary_log: negative label
            // means negative class).
            mini_batch_fake_labels.push_back(-1.0f);
        }
        // dis_trainer.get_net(force_flush_to_disk::no);
        dis_trainer.train_one_step(mini_batch_fake_samples, mini_batch_fake_labels);
        // NOTE(review): this grabs the discriminator subnet's gradient
        // tensor and pushes it backwards through the generator, but no
        // optimizer step is ever applied to the generator's parameters —
        // presumably the generator never actually learns. This is the exact
        // spot the gist discussion flags as uncertain; confirm against
        // dlib's intended API usage before relying on it.
        resizable_tensor loss_fake = discriminator.subnet().get_gradient_input();
        generator.subnet().back_propagate_error(loss_fake);
    }
    return EXIT_SUCCESS;
}
catch(exception& e)
{
    cout << e.what() << endl;
    return EXIT_FAILURE;
}
Yes, I'm not sure either... Please, let me know if you find something interesting :D
Hi, I updated the code here: https://gist.github.com/Cydral/92be4e848551429ec1a6919d6d813c08.
I used another approach for defining the G and D networks, but in the end it's very close to your own code. It seems to work overall... except for the back-propagation of the loss tensor values. Maybe Davis could advise us on that?
By the way, it only works for a single plane at the moment; I had initially made a version to infer an RGB image, but I have a problem getting a 3D matrix representing a reconstructed image from the generator outputs. Maybe it is necessary to extract each scalar value of k and rebuild an RGB image from these separate planes (I haven't tried yet)?
Yes, I also saw the sample you mentioned. I like this approach because that's - I assume - on track with the idea behind the unsupervised approach proposed by Radford & Co. This critical and central part:
"resizable_tensor loss_fake = discriminator.subnet().get_gradient_input();
generator.subnet().back_propagate_error(loss_fake);"
is for me the biggest difficulty because I studied Dlib's code and I'm not really sure that's the way to get the value of the loss function. Well, I'll look at it today and keep you informed tomorrow.