// Dependencies: lodash for grouping/sorting helpers, conllu-stream for
// parsing the CoNLL-U treebank, fs for reading the input file.
var _ = require('lodash');
var conllu = require('conllu-stream');
var fs = require('fs');
|
|
|
// Format a fraction as a short percentage string.
//
// value - fraction to format (e.g. 0.25 for 25%).
//
// Returns '' when the scaled value is NaN (e.g. `undefined` input or 0/0).
// Otherwise returns the percentage followed by '%', using 3 significant
// digits above 10%, 2 significant digits above 1% and 1 significant digit
// for anything smaller.
function percent(value) {
  const scaled = value * 100;

  // Callers rely on the empty string for "no value" cells.
  if (Number.isNaN(scaled)) {
    return '';
  }

  let digits = 1;
  if (scaled > 10) {
    digits = 3;
  } else if (scaled > 1) {
    digits = 2;
  }

  return `${scaled.toPrecision(digits)}%`;
}
|
|
|
// Function to calculate and display a histogram.
// It first calculates the histogram of the primary `key`.
// For each key it calculates a secondary histogram of the secondary `linkKey`.
//
// words   - array of word objects to count.
// key     - lodash iteratee (here: a property name) selecting the primary value.
// linkKey - lodash iteratee (here: a property name) selecting the linked value.
//
// Prints a tab-separated table to stdout with one row per primary value:
// value, count, share of all words, and the most frequent linked values with
// their share within that row. Only the most and least frequent rows are
// shown; a `--` row marks omitted rows in between. Returns nothing.
function displayHistogram(words, key, linkKey) {
  // Number of rows shown from each end of the histogram.
  const edgeSize = 10;
  // Number of linked values listed per row.
  const linkSize = 6;

  // Calculate `key` histogram of words as [ value, count ] pairs, sorted by
  // descending frequency (ties broken by descending value).
  const grouped = _.groupBy(words, key);
  const histogram = _(grouped)
    .mapValues('length')
    .toPairs()
    .sortBy([ 1, 0 ])
    .reverse()
    .value();

  // Build the table row for one [ value, count ] pair, including the
  // top `linkSize` entries of its secondary `linkKey` histogram.
  const toRow = pair => {
    const linked = _.map(grouped[pair[0]], linkKey);
    const links = _(linked)
      .groupBy()
      .mapValues('length')
      .toPairs()
      .sortBy([ 1, 0 ])
      .reverse()
      // Show percent values for each item.
      .map(item => `${item[0]} (${percent(item[1] / linked.length)})`)
      .slice(0, linkSize)
      .join(', ');
    return [ pair[0], pair[1], percent(pair[1] / words.length), links ];
  };

  // Show the top and bottom parts only when rows are actually omitted.
  // Otherwise the two slices would overlap and rows would be printed twice.
  let rows;
  if (histogram.length > 2 * edgeSize) {
    // The separator is a literal row, so a real `--` key in the data cannot
    // leak its linked values into it.
    const separator = [ '--', '', '', '' ];
    rows = histogram.slice(0, edgeSize).map(toRow)
      .concat([ separator ])
      .concat(histogram.slice(-edgeSize).map(toRow));
  } else {
    rows = histogram.map(toRow);
  }

  // Display table of results.
  console.log('-- %s --', key);
  console.log();
  console.log(rows.map(row => row.join('\t')).join('\n'));
  console.log();
  console.log('#words :', words.length);
  console.log('#histogram :', histogram.length);
  console.log();
}
|
|
|
|
|
// Array to store all word objects we encounter.
var words = [];

// Universal POS tags that are left out of the statistics.
var EXCLUDED_UPOSTAGS = [ 'PUNCT', 'NUM' ];

// Parse CoNLL-U file.
fs.createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
  .pipe(conllu())
  .on('data', sentence => {
    // Collect all words and lemmas in lowercase (except punctuation/numbers).
    // NOTE(review): `getSequence()` comes from conllu-stream; presumably it
    // yields the sentence's word objects in order -- confirm against its docs.
    sentence.getSequence()
      .filter(word => EXCLUDED_UPOSTAGS.indexOf(word.upostag) === -1)
      .forEach(word => {
        // Using lowercase so "Haus" and "haus" are counted together.
        // The parsed word object is modified in place before it is stored.
        word.form = word.form.toLowerCase();
        word.lemma = word.lemma.toLowerCase();
        words.push(word);
      });
  })
  .on('end', () => {
    // Calculate and display histograms of words vs. lemmas and vice versa.
    displayHistogram(words, 'form', 'lemma');
    displayHistogram(words, 'lemma', 'form');
    console.log('-- done --');
  });