Reworked "create-freq.js", and changed corpus

seth-js · Oct 22, 2023 · ab437c9 · ab437c9
1 parent e623ba2
commit ab437c9
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 46 deletions.
diff --git a/src/create-freq.js b/src/create-freq.js
@@ -22,24 +22,27 @@ for (const [lemma, info] of Object.entries(lemmaDict)) {
   }
 }
 
-// const sentences = JSON.parse(readFileSync('data/sentences/opensubtitles-es-sentences.json'));
+const sentences = JSON.parse(readFileSync('data/sentences/netflix-es-sentences.json'));
 
-const sentences = ['¡Un mundo de espadas y hechicería!', 'si Dios quiere.'];
+// const sentences = ['¡Un mundo de espadas y hechicería!', 'si Dios quiere.'];
 
-const freqList = {};
+const freqList = new Map();
 let totalWords = 0;
 let missedWords = 0;
+let sentenceLimit = 5000000;
+
+console.log('Parsing corpus...');
 
 let index = 0;
 for (const sentence of sentences) {
   index++;
   // log progress the first time, then every 100,000 sentences, and the last one
   if (index === 1 || index % 100000 === 0 || index === sentences.length) {
-    console.log(`(${index}/${sentences.length})`);
+    console.log(`(${index.toLocaleString()} of ${sentences.length.toLocaleString()} sentences parsed)`);
   }
 
-  // stop at 5 million
-  if (index === 5000000) {
+  if (index === sentenceLimit) {
+    console.log(`(${sentenceLimit.toLocaleString()} sentence limit reached. moving on...)`)
     break;
   }
 
@@ -51,69 +54,78 @@ for (const sentence of sentences) {
   for (const { word, surface } of customWords) {
     if (word !== '' && /\p{L}/u.test(word) && /\p{L}/u.test(surface) && !nameDict.has(word)) {
       totalWords++;
-      freqList[word] = (freqList[word] || 0) + 1;
+
+      if (freqList.has(word)) {
+        freqList.set(word, freqList.get(word) + 1);
+      } else {
+        freqList.set(word, 1);
+      }
     }
 
-    if (word === '' && /\p{L}/u.test(word) && /\p{L}/u.test(surface)) {
+    if (word === '' && /\p{L}/u.test(surface)) {
       missedWords++;
     }
   }
 }
 
-const freqArr = Object.entries(freqList)
-  .filter(([word]) => lemmaDict[word])
-  .map(([word, count]) => ({ word, count }))
-  .sort((a, b) => b.count - a.count);
+console.log('Done parsing.');
+
+const freqArr = [];
+
+for (const [word, count] of freqList) {
+  freqArr.push({ word, count });
+}
 
-const totalCount = freqArr.reduce((sum, entry) => sum + entry.count, 0);
+freqArr.sort((a, b) => b.count - a.count);
 
-const thresholds = [0.95, 0.98, 0.99];
-const coverage = new Map();
-const thousand = [];
+const nineFive = [];
+const nineEight = [];
+const nineNine = [];
+const thousand = {};
 
 let percSoFar = 0.0;
 
 for (const { word, count } of freqArr) {
-  percSoFar += count / totalCount;
+  percSoFar += count / totalWords;
 
-  for (const threshold of thresholds) {
-    if (threshold >= percSoFar) {
-      coverage.set(threshold, coverage.get(threshold) || new Set());
-      coverage.get(threshold).add(word);
-    }
+  if (0.95 >= percSoFar) {
+    nineFive.push(word);
   }
 
-  if (coverage.get(0.95).size === 1000) {
-    thousand.push(...coverage.get(0.95));
-    console.log(`The top 1000 words cover ${+(percSoFar * 100).toFixed(2)}%.`);
+  if (0.98 >= percSoFar) {
+    nineEight.push(word);
   }
-}
 
-const hundredCoverage = {};
+  if (0.99 >= percSoFar) {
+    nineNine.push(word);
+  }
 
-for (const { word, count } of freqArr) {
-  hundredCoverage[word] = count;
+  if (nineFive.length === 1000) {
+    thousand.words = [...nineFive];
+    thousand.coverage = `${+(percSoFar * 100).toFixed(2)}%`;
+  }
 }
 
 const message = `
-Your corpus is made up of ${totalCount} words.
-${coverage.get(0.95).size} words cover 95%.
-${coverage.get(0.98).size} words cover 98%.
-${coverage.get(0.99).size} words cover 99%.
+Your corpus is made up of ${totalWords.toLocaleString()} words.
+The 1000 most common words cover ${thousand.coverage}.
+${nineFive.length} words cover 95%.
+${nineEight.length} words cover 98%.
+${nineNine.length} words cover 99%.
 
-Frequency list contains ${freqArr.length} unique word(s).
+Frequency list contains ${freqArr.length.toLocaleString()} unique word(s).
 
 ${((totalWords - missedWords) / totalWords * 100).toFixed(2)}% of words were able to find a definition.
 `;
 
 console.log(message);
 
 const frequencies = {
-  'nine-five': Array.from(coverage.get(0.95)),
-  'nine-eight': Array.from(coverage.get(0.98)),
-  'nine-nine': Array.from(coverage.get(0.99)),
+  'nine-five': nineFive,
+  'nine-eight': nineEight,
+  'nine-nine': nineNine,
   '1k': thousand,
-  'hundred': hundredCoverage,
+  'hundred': freqArr,
 };
 
 for (const [file, data] of Object.entries(frequencies)) {
@@ -123,7 +135,7 @@ for (const [file, data] of Object.entries(frequencies)) {
 writeFileSync('data/freq/info.txt', message);
 
 function getWords(sentence) {
-  return sentence.split(/(?=\s)|(?<=\s)|(?=[.,!?—\]\[\)":¡])|(?<=[.,!?—\]\[\(":¡])/g)
+  return sentence.replace(/^-/, '- ').split(/(?=\s)|(?<=\s)|(?=[.,!?—\]\[\)":¡¿…])|(?<=[.,!?—\]\[\(":¡¿…])/g)
     .map(word => {
       if (/[.,!?:"]|\s/.test(word)) {
         return { word, lemma: word };

diff --git a/src/make-yomichan.js b/src/make-yomichan.js
@@ -10,11 +10,14 @@ const lemmaDict = JSON.parse(readFileSync('data/tidy/spanish-lemmas.json'));
 const formDict = JSON.parse(readFileSync('data/tidy/spanish-forms.json'));
 
 let popularDict;
-let frequencies = {};
+const frequencies = new Map();
 
-if (existsSync('data/freq/nine-five.json') && existsSync('data/freq/hundred.json')) {
-  popularDict = new Set(JSON.parse(readFileSync('data/freq/nine-five.json')));
-  frequencies = JSON.parse(readFileSync('data/freq/hundred.json'));
+if (existsSync('data/freq/nine-eight.json') && existsSync('data/freq/hundred.json')) {
+  popularDict = new Set(JSON.parse(readFileSync('data/freq/nine-eight.json')));
+
+  for (const { word, count } of JSON.parse(readFileSync('data/freq/hundred.json'))) {
+    frequencies.set(word, count);
+  }
 }
 
 const lemmaYomi = [];
@@ -29,7 +32,7 @@ for (const [lemma, infoMap] of allInfo) {
     const tags = [pos, ...(info.tags || [])].join(' ');
     const ipa = info.ipa || '';
     const popular = popularDict && popularDict.has(lemma) ? 'P' : '';
-    const freq = frequencies[lemma] || 0;
+    const freq = frequencies.get(lemma) || 0;
 
     // term, ipa, tags, rules, frequency, definitions, sequence, tags2
     lemmaYomi.push([lemma, ipa, tags, '', freq, glosses, 0, popular]);
@@ -55,7 +58,10 @@ for (const [form, allInfo] of Object.entries(formDict)) {
   }
 }
 
-const tagBank = Array.from(allPOS).map((pos) => [pos, 'partOfSpeech', -3, pos, 0]);
+const tagBank = [
+  ['P', 'popular', -10, 'popular term', 10],
+  ...Array.from(allPOS).map((pos) => [pos, 'partOfSpeech', -3, pos, 0])
+];
 
 const customTags = ['non-lemma', 'masculine', 'feminine', 'neuter'];
 
@@ -80,7 +86,7 @@ while (allYomi.length > 0) {
   writeFileSync(`${yomiPath}/term_bank_${bankIndex}.json`, JSON.stringify(batch));
 }
 
-const freqYomi = Object.entries(frequencies).map(([word, count]) => [word, 'freq', count]);
+const freqYomi = [...frequencies.entries()].map(([word, count]) => [word, 'freq', count]);
 
 writeFileSync(`${yomiPath}/term_meta_bank_1.json`, JSON.stringify(freqYomi));