walling · June 21, 2017 20:30
diff --git a/readme.md b/readme.md
diff --git a/example1-first-usage.js b/example1-first-usage.js
 var conllu = require('conllu-stream');
 var fs     = require('fs');

 // Pipe the treebank file through the `conllu-stream` parser and, for every
 // sentence object it emits, print the sentence's `sent_id` feature followed
 // by the sentence rendered with `toString()`.
 fs.createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
     .pipe(conllu())
     .on('data', function (sentence) {
         const sentenceId = sentence.features.sent_id;
         const rendered   = sentence.toString();
         console.log(sentenceId, rendered);
     });
diff --git a/example2-word-lemma-histograms.js b/example2-word-lemma-histograms.js
 var _      = require('lodash');
 var conllu = require('conllu-stream');
 var fs     = require('fs');

 // Format a fraction as a compact percent string.
 // The value is scaled by 100 and printed with 3 significant digits when the
 // result is above 10, 2 digits when above 1, and 1 digit otherwise.
 // A value that is NaN after scaling yields the empty string.
 function percent(value) {
     const scaled = value * 100;
     if (isNaN(scaled)) {
         return '';
     }

     // Pick the number of significant digits from the magnitude.
     let digits = 1;
     if (scaled > 10) {
         digits = 3;
     } else if (scaled > 1) {
         digits = 2;
     }
     return `${scaled.toPrecision(digits)}%`;
 }

 // Function to calculate and display histogram.
 // It first calculates the histogram of the primary `key`.
 // For each key it calculates a secondary histogram of the secondary `linkKey`.
 //
 // Parameters:
 //   words   - array of word objects; each one is read through `key` and `linkKey`.
 //   key     - property name the primary histogram is built on.
 //   linkKey - property name the per-row secondary histogram is built on.
 //
 // Returns nothing; the table is written with console.log.
 function displayHistogram(words, key, linkKey) {

     // Number of rows shown from each end of the sorted histogram.
     var edgeSize = 10;

     // Calculate `key` histogram of words, sorted by the frequency
     // (most frequent first; ties ordered by key, reversed).
     var grouped   = _.groupBy(words, key);
     var histogram = _(grouped)
         .mapValues('length')
         .toPairs()
         .sortBy([ 1, 0 ])
         .reverse()
         .value();

     // Get top and bottom parts of the histogram. The bottom part never
     // starts before the end of the top part, so a histogram with fewer than
     // 2 * edgeSize rows does not list the same row twice.
     var top    = histogram.slice(0, edgeSize);
     var bottom = histogram.slice(Math.max(edgeSize, histogram.length - edgeSize));

     // Build one display row: key, count, share of all words, and the top-6
     // of the secondary `linkKey` histogram for that key.
     function toRow(pair) {
         var linked = _.map(grouped[pair[0]], linkKey);
         var linkedSummary =
             _(linked)
             .groupBy()
             .mapValues('length')
             .toPairs()
             .sortBy([ 1, 0 ])
             .reverse()
             // Show percent values for each item.
             .map(item => `${item[0]} (${percent(item[1]/linked.length)})`)
             .slice(0, 6)
             .join(', ');
         return [ pair[0], pair[1], percent(pair[1] / words.length), linkedSummary ];
     }

     // Combine top and bottom parts to display. The separator is a fixed row
     // (never looked up in `grouped`), and is only shown when a bottom part
     // follows it.
     var rows = top.map(toRow);
     if (bottom.length > 0) {
         rows = rows.concat([[ '--', undefined, '', '' ]], bottom.map(toRow));
     }

     // Display table of results.
     console.log('-- %s --', key);
     console.log();
     console.log(rows.map(row => row.join('\t')).join('\n'));
     console.log();
     console.log('#words     :', words.length);
     console.log('#histogram :', histogram.length);
     console.log();

 }


 // Accumulates every word object kept from the parsed sentences.
 var words = [];

 // Parse CoNLL-U file, collect the words, then print both histograms.
 fs.createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
     .pipe(conllu())
     .on('data', function (sentence) {
         // Words tagged as punctuation or numbers are left out.
         const skippedTags = [ 'PUNCT', 'NUM' ];

         sentence.getSequence().forEach(function (word) {
             if (skippedTags.indexOf(word.upostag) !== -1) {
                 return;
             }
             // Lowercase in place so "Haus" and "haus" land in the same bucket.
             word.form  = word.form.toLowerCase();
             word.lemma = word.lemma.toLowerCase();
             words.push(word);
         });
     })
     .on('end', function () {
         // Histogram of forms (linked to lemmas), then lemmas (linked to forms).
         displayHistogram(words, 'form', 'lemma');
         displayHistogram(words, 'lemma', 'form');
         console.log('-- done --');
     });
diff --git a/example3-multiword-histogram.js b/example3-multiword-histogram.js
 var _      = require('lodash');
 var conllu = require('conllu-stream');
 var fs     = require('fs');

 // One "<multiword>\t-->\t<expansion>" string per multiword token encountered.
 var multiwords = [];

 fs.createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
     .pipe(conllu())
     .on('data', function (sentence) {
         // Collect every multiword token together with its expansion, in lowercase.
         sentence.structure.multiwords.forEach(function (tokenId) {
             const multiword = sentence.tokens[tokenId];

             // Get expanded form of the multiword: the forms of the tokens
             // from `position` through `endPosition`, joined by spaces.
             const positions = _.range(multiword.position, multiword.endPosition+1);
             const expansion = positions
                 .map(position => sentence.tokens[String(position)].form)
                 .join(' ');

             // Store multiword and its expansion.
             multiwords.push(`${multiword.form.toLowerCase()}\t-->\t${expansion.toLowerCase()}`);
         });
     })
     .on('end', function () {
         // Count identical lines and list them sorted by frequency.
         const histogram = _(multiwords)
             .groupBy()
             .mapValues('length')
             .toPairs()
             .sortBy([ 1, 0 ])
             .reverse();

         const report = histogram
             .map(row => row.join('\t\t'))
             .join('\n');

         console.log(report);
     });
diff --git a/package.json b/package.json
 {
  "dependencies": {
    "conllu-stream": "0.0.1",
    "lodash": "^4.17.4"
  }
 }
diff --git a/sentence.conllu b/sentence.conllu
 # sent_id = train-s2
 # text = Die Kosten sind definitiv auch im Rahmen.
 #id	form	lemma	upostag	xpostag	feats	head	deprel	deps	misc
 1	Die	der	DET	ART	Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art	2	det	_	_
 2	Kosten	Kosten	NOUN	NN	Case=Nom|Gender=Fem|Number=Sing	3	nsubj:pass	_	_
 3	sind	sein	VERB	VAFIN	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_
 4	definitiv	definitiv	ADV	ADJD	_	3	advmod	_	_
 5	auch	auch	ADV	ADV	_	3	advmod	_	_
 6-7	im	_	_	_	_	_	_	_	_
 6	in	in	ADP	APPR	_	8	case	_	_
 7	dem	der	DET	ART	Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art	8	det	_	_
 8	Rahmen	Rahmen	NOUN	NN	Case=Dat|Gender=Masc,Neut|Number=Sing	3	obl	_	SpaceAfter=No
 9	.	.	PUNCT	$.	_	3	punct	_	_
	var conllu = require('conllu-stream');
	var fs = require('fs');

	// Stream the treebank file into the `conllu-stream` parser; each emitted
	// sentence is printed as its `sent_id` feature plus its `toString()` text.
	fs.createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
	    .pipe(conllu())
	    .on('data', function (sentence) {
	        const id   = sentence.features.sent_id;
	        const text = sentence.toString();
	        console.log(id, text);
	    });
	var _ = require('lodash');
	var conllu = require('conllu-stream');
	var fs = require('fs');

	// Print a fraction as a percent string (nicely).
	// The value is multiplied by 100; the result is shown with 3 significant
	// digits above 10, 2 above 1, otherwise 1. NaN gives an empty string.
	function percent(value) {
	    const pct = value * 100;
	    if (isNaN(pct)) {
	        return '';
	    }
	    if (pct > 10) {
	        return pct.toPrecision(3) + '%';
	    }
	    if (pct > 1) {
	        return pct.toPrecision(2) + '%';
	    }
	    return pct.toPrecision(1) + '%';
	}

	// Function to calculate and display histogram.
	// It first calculates the histogram of the primary `key`.
	// For each key it calculates a secondary histogram of the secondary `linkKey`.
	//
	// Parameters:
	//   words   - array of word objects; each one is read through `key` and `linkKey`.
	//   key     - property name the primary histogram is built on.
	//   linkKey - property name the per-row secondary histogram is built on.
	//
	// Returns nothing; the table is written with console.log.
	function displayHistogram(words, key, linkKey) {

	    // Number of rows shown from each end of the sorted histogram.
	    var edgeSize = 10;

	    // Calculate `key` histogram of words, sorted by the frequency
	    // (most frequent first; ties ordered by key, reversed).
	    var grouped = _.groupBy(words, key);
	    var histogram = _(grouped)
	        .mapValues('length')
	        .toPairs()
	        .sortBy([ 1, 0 ])
	        .reverse()
	        .value();

	    // Get top and bottom parts of the histogram. The bottom part never
	    // starts before the end of the top part, so a histogram with fewer
	    // than 2 * edgeSize rows does not list the same row twice.
	    var top = histogram.slice(0, edgeSize);
	    var bottom = histogram.slice(Math.max(edgeSize, histogram.length - edgeSize));

	    // Build one display row: key, count, share of all words, and the
	    // top-6 of the secondary `linkKey` histogram for that key.
	    function toRow(pair) {
	        var linked = _.map(grouped[pair[0]], linkKey);
	        var linkedSummary =
	            _(linked)
	            .groupBy()
	            .mapValues('length')
	            .toPairs()
	            .sortBy([ 1, 0 ])
	            .reverse()
	            // Show percent values for each item.
	            .map(item => `${item[0]} (${percent(item[1]/linked.length)})`)
	            .slice(0, 6)
	            .join(', ');
	        return [ pair[0], pair[1], percent(pair[1] / words.length), linkedSummary ];
	    }

	    // Combine top and bottom parts to display. The separator is a fixed
	    // row (never looked up in `grouped`), and is only shown when a bottom
	    // part follows it.
	    var rows = top.map(toRow);
	    if (bottom.length > 0) {
	        rows = rows.concat([[ '--', undefined, '', '' ]], bottom.map(toRow));
	    }

	    // Display table of results.
	    console.log('-- %s --', key);
	    console.log();
	    console.log(rows.map(row => row.join('\t')).join('\n'));
	    console.log();
	    console.log('#words :', words.length);
	    console.log('#histogram :', histogram.length);
	    console.log();

	}


	// Array to store all word objects we encounter.
	var words = [];

	// Parse CoNLL-U file.
	fs.createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
	    .pipe(conllu())
	    .on('data', function (sentence) {
	        // Keep every word except punctuation/numbers, lowercased in place
	        // so "Haus" and "haus" are counted together.
	        sentence.getSequence().forEach(function (word) {
	            const isSkipped = [ 'PUNCT', 'NUM' ].indexOf(word.upostag) !== -1;
	            if (!isSkipped) {
	                word.form = word.form.toLowerCase();
	                word.lemma = word.lemma.toLowerCase();
	                words.push(word);
	            }
	        });
	    })
	    .on('end', function () {
	        // Calculate and display histograms of words vs. lemmas and vice versa.
	        displayHistogram(words, 'form', 'lemma');
	        displayHistogram(words, 'lemma', 'form');
	        console.log('-- done --');
	    });
	{
	"dependencies": {
	"conllu-stream": "0.0.1",
	"lodash": "^4.17.4"
	}
	}
	# sent_id = train-s2
	# text = Die Kosten sind definitiv auch im Rahmen.
	#id	form	lemma	upostag	xpostag	feats	head	deprel	deps	misc
	1	Die	der	DET	ART	Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art	2	det	_	_
	2	Kosten	Kosten	NOUN	NN	Case=Nom|Gender=Fem|Number=Sing	3	nsubj:pass	_	_
	3	sind	sein	VERB	VAFIN	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_
	4	definitiv	definitiv	ADV	ADJD	_	3	advmod	_	_
	5	auch	auch	ADV	ADV	_	3	advmod	_	_
	6-7	im	_	_	_	_	_	_	_	_
	6	in	in	ADP	APPR	_	8	case	_	_
	7	dem	der	DET	ART	Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art	8	det	_	_
	8	Rahmen	Rahmen	NOUN	NN	Case=Dat|Gender=Masc,Neut|Number=Sing	3	obl	_	SpaceAfter=No
	9	.	.	PUNCT	$.	_	3	punct	_	_