-
-
Save lavoiesl/9a08e399fc9832d12794 to your computer and use it in GitHub Desktop.
| // gcc -O2 -Wall -pedantic process-mysqldump.c -o process-mysqldump | |
| // Usage: cat dump.sql | process-mysqldump | |
| // Or : process-mysqldump dump.sql | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <stdbool.h> | |
| #include <string.h> | |
| #define BUFFER 100000 | |
| bool is_escaped(char* string, int offset) { | |
| if (offset == 0) { | |
| return false; | |
| } else if (string[offset - 1] == '\\') { | |
| return !is_escaped(string, offset - 1); | |
| } else { | |
| return false; | |
| } | |
| } | |
| bool is_commented(char* string) { | |
| char buffer[4]; | |
| sprintf(buffer, "%.3s", string); | |
| return strcmp(buffer, "-- ") == 0; | |
| } | |
| int main(int argc, char *argv[]) | |
| { | |
| FILE* file = argc > 1 ? fopen(argv[1], "r") : stdin; | |
| char buffer[BUFFER]; | |
| char* line; | |
| int pos; | |
| int parenthesis = 0; | |
| bool quote = false; | |
| bool escape = false; | |
| bool comment = false; | |
| while (fgets(buffer, BUFFER, file) != NULL) { | |
| line = buffer; | |
| // skip commented | |
| if (comment || is_commented(line)) { | |
| comment = line[strlen(line) - 1] != '\n'; | |
| fputs(line, stdout); | |
| } else { | |
| pos = 0; | |
| nullchar: | |
| while (line[pos] != '\0') { | |
| // if we are still in escape state, we need to check first char. | |
| if (!escape) { | |
| // find any character in ()' | |
| pos = strcspn(line, "()'\\"); | |
| } | |
| if (pos > 0) { | |
| // print before match | |
| printf("%.*s", pos, line); | |
| } | |
| switch (line[pos]) { | |
| case '(': | |
| if (!quote) { | |
| if (parenthesis == 0) { | |
| putchar('\n'); | |
| } | |
| parenthesis++; | |
| } | |
| if (escape) { | |
| escape = false; | |
| } | |
| break; | |
| case ')': | |
| if (!quote) { | |
| if (parenthesis > 0) { | |
| parenthesis--; | |
| } else { | |
| // whoops | |
| puts("\n"); | |
| fputs(line, stdout); | |
| fputs("Found closing parenthesis without opening one.\n", stderr); | |
| exit(1); | |
| } | |
| } | |
| if (escape) { | |
| escape = false; | |
| } | |
| break; | |
| case '\\': | |
| escape = !escape; | |
| break; | |
| case '\'': | |
| if (escape) { | |
| escape = false; | |
| } else { | |
| quote = !quote; | |
| } | |
| break; | |
| case '\0': | |
| goto nullchar; | |
| default: | |
| if (escape) { | |
| escape = false; | |
| } | |
| break; | |
| } | |
| // print char then skip it (to make sure we don’t double match) | |
| putchar(line[pos]); | |
| line = line + pos + 1; | |
| pos = 0; | |
| } | |
| } | |
| } | |
| return 0; | |
| } |
| <?php | |
| // Usage: cat dump.sql | php process-mysqldump.php | |
| $input = fopen('php://stdin', 'r'); | |
| while(!feof($input)) { | |
| $line = fgets($input); | |
| if (substr($line, 0, 6) == 'INSERT') { | |
| process_line($line); | |
| } else { | |
| echo $line; | |
| } | |
| } | |
| function process_line($line) { | |
| $length = strlen($line); | |
| $pos = strpos($line, ' VALUES ') + 8; | |
| echo substr($line, 0, $pos); | |
| $parenthesis = false; | |
| $quote = false; | |
| $escape = false; | |
| for ($i = $pos; $i < $length; $i++) { | |
| switch($line[$i]) { | |
| case '(': | |
| if (!$quote) { | |
| if ($parenthesis) { | |
| throw new Exception('double open parenthesis'); | |
| } else { | |
| echo PHP_EOL; | |
| $parenthesis = true; | |
| } | |
| } | |
| $escape = false; | |
| break; | |
| case ')': | |
| if (!$quote) { | |
| if ($parenthesis) { | |
| $parenthesis = false; | |
| } else { | |
| throw new Exception('closing parenthesis without open'); | |
| } | |
| } | |
| $escape = false; | |
| break; | |
| case '\\': | |
| $escape = !$escape; | |
| break; | |
| case "'": | |
| if ($escape) { | |
| $escape = false; | |
| } else { | |
| $quote = !$quote; | |
| } | |
| break; | |
| default: | |
| $escape = false; | |
| break; | |
| } | |
| echo $line[$i]; | |
| } | |
| } | |
| fclose($input); |
good job, thanks! couple of notes:
change the open mode from "r" to "rb", otherwise you might suffer silent data corruption when exporting on a windows system (values with \n might suddenly become \r\n when exported on windows - and linux is unaffected by this change)
also, the PHP version would probably be faster if you dropped $input and used STDIN instead (magic constant, i think fopen(php://stdin) use some ugly stream_copy_to_stream stuff behind the scene, because it actually creates a new file descriptor when inspecting /proc/ , at least on linux...)
also, in PHP, pre-increment (++$i) is faster than post-increment ($i++) - this is also true for C, but gcc auto-optimizes that on -O1 and above.
also, replace this
while(!feof($input)) {
$line = fgets($input);
with this
while(false!==($line = fgets($input)) ) {
and it will be functionally equivalent, but faster (saving a syscall at least - keep in mind that STDIN is in blocking mode by default, and fgets() returns bool(false) on EOF, which is the only thing you were using feof() for)
- also, the PHP code could probably get a lot faster by doing ob_start() ob_end_flush() between process_line, because stupid php use write() instead of fwrite() internally, which is completely unbuffered, php basically does the equivalent of printf(...); fflush(stdout) on every echo.. which, in this code, is every byte (
echo $line[$i];prints 1 byte at a time).. doing
if (substr($line, 0, 6) == 'INSERT') {
ob_start();
process_line($line);
ob_end_flush();
} else {
would probably make the script much faster. (and modifying process_line to return a string instead of printing, and do echo process_line($line); would probably be even faster than ob_start/end_flush)
(also something could be said about adding 100000 as the 2nd argument to fgets, it would probably get faster, but i haven't done benchmarks, and im not 100% positive - also it would introduce the 100000 quote C-only bug in PHP)
here is an optimized php version: https://gist.github.com/divinity76/494fef3d2ea1b800bb98e13db9c3ec3a#file-process-mysqldump-php
here are some benchmarks, comparing the C version with the original PHP version with the optimized PHP version:
./bench.php 'cat dump.sql | ./process-mysqldump' >/dev/null
0.268805
0.275636
0.277831
0.274316
0.308055
./bench.php 'cat dump.sql | php original.php' >/dev/null
11.146122
11.119936
10.938375
11.308797
10.875135
./bench.php 'cat dump.sql | php optimized.php' >/dev/null
2.930665
2.974056
2.947326
2.916559
2.994324
- the optimized PHP version is now about 3.7 times faster than the original, while still being about 10.7 times slower than the C version..
¯\_(ツ)_/¯
du -h dump.sql
1.1M dump.sql
bench.php: https://gist.github.com/divinity76/b0471cfa6596c92dd6aba2078ea0fde8
(also, i can optimize the PHP version even further, but i won't bother, i wouldn't be able to gain much speed at this point anyway - specifically, i can optimize away a few function call overheads)
did some further optimizations to the PHP version, now it's about 8.4 times faster than the original PHP version,
and about 4.8 times slower than the C version.
1.323755
1.353491
1.351569
1.332137
1.386095
- could speed it up a lot by doing
$ret .= $line[$i];
$to = strcspn($line, '()\\\'', $i+1);
if ($to !== 0) {
$ret .= substr($line, $i+1, $to);
$i += ($to);
}What license is this code?
Hello.
Thanks for your great work, but there is a bug. If someone writes in a varchar "-- " at the 100000th character, your code will think it's a comment. In the C version.
You should replace the "if(comment ..." with "if (!quote && (comment || is_commented(line))) {"
It took me a long time to figure that bug out... Thanks a lot
Thanks a lot, very useful!