-
Notifications
You must be signed in to change notification settings - Fork 59.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #33337 from github/repo-sync
Repo sync
- Loading branch information
Showing
7 changed files
with
448 additions
and
16 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
import { cuss } from 'cuss' | ||
import { cuss as cussPt } from 'cuss/pt' | ||
import { cuss as cussFr } from 'cuss/fr' | ||
import { cuss as cussEs } from 'cuss/es' | ||
import { Language } from '@horizon-rs/language-guesser' | ||
|
||
// Shared guesser instance from @horizon-rs/language-guesser; isNotLanguage()
// uses it to guess which language a comment is written in.
const language = new Language()
|
||
/**
 * Ordered list of heuristics ("signals") that are applied to a comment.
 * Exported for the debugging CLI script.
 *
 * Each entry has:
 * - `reduction`: how much a hit subtracts from the rating (which starts at 1.0)
 * - `name`: the identifier reported when the signal hits
 * - `validator`: called as `validator(comment, language)`; a truthy result is a hit
 *
 * The list is walked top to bottom and consumers stop once the rating
 * has dropped to zero, so the order of the entries matters.
 */
export const SIGNAL_RATINGS = [
  {
    reduction: 1.0,
    name: 'email-only',
    validator: (comment) => isEmailOnly(comment),
  },
  {
    reduction: 0.2,
    name: 'contains-email',
    validator: (comment) => isContainingEmail(comment),
  },
  {
    reduction: 0.1,
    name: 'url-only',
    validator: (comment) => isURL(comment),
  },
  {
    reduction: 0.1,
    name: 'numbers-only',
    validator: (comment) => isNumbersOnly(comment),
  },
  {
    reduction: 0.1,
    name: 'all-uppercase',
    validator: (comment) => isAllUppercase(comment),
  },
  {
    reduction: 0.1,
    name: 'too-short',
    validator: (comment) => isTooShort(comment),
  },
  {
    reduction: 0.2,
    name: 'not-language',
    validator: (comment, lang) => isNotLanguage(comment, lang),
  },
  {
    reduction: 0.3,
    name: 'cuss-words-likely',
    validator: (comment, lang) => isLikelyCussWords(comment, lang),
  },
  {
    reduction: 0.1,
    name: 'cuss-words-maybe',
    validator: (comment, lang) => isMaybeCussWords(comment, lang),
  },
]
|
||
/**
 * Rate a comment by running it through a list of signal checks.
 *
 * The rating starts at 1.0 and every signal whose validator returns a
 * truthy value subtracts its `reduction`. Evaluation stops as soon as the
 * rating reaches zero.
 *
 * @param text - The comment to analyze.
 * @param language - Language code passed on to each validator (default `'en'`).
 * @param signalRatings - Signals to evaluate, in order (default `SIGNAL_RATINGS`).
 * @returns A promise of `{ signals, rating }`: the names of the signals that
 *   hit, and the resulting rating, which is never below 0.
 */
export async function analyzeComment(text, language = 'en', signalRatings = SIGNAL_RATINGS) {
  const signals = []
  let rating = 1.0
  for (const { reduction, name, validator } of signalRatings) {
    if (!validator(text, language)) continue
    signals.push(name)
    // Round away binary floating point noise (1 - 0.1 - 0.1 - 0.1 would
    // otherwise be 0.7000000000000001) and never report less than zero.
    rating = Math.max(0, Math.round((rating - reduction) * 1e10) / 1e10)
    if (rating <= 0) break
  }

  return { signals, rating }
}
|
||
/**
 * Whether the text is nothing but a single email-like token.
 *
 * That means: no whitespace (other than leading/trailing), no `://`
 * anywhere, and exactly one `@` with at least one character on each side
 * of it. No further validation of the address is attempted.
 *
 * @param text - The text to inspect.
 * @returns `true` if the whole text looks like one email address, else `false`.
 */
function isEmailOnly(text) {
  const trimmed = text.trim()
  if (/\s/.test(trimmed) || text.includes('://')) return false
  const parts = trimmed.split('@')
  // Two parts means exactly one `@`. A bare `@` or `@name` is not an address.
  return parts.length === 2 && parts[0] !== '' && parts[1] !== ''
}
|
||
/**
 * Whether the text contains an email address in among other content.
 *
 * Text that is only an email address (see isEmailOnly) is deliberately
 * excluded here so it is not counted twice.
 *
 * @param text - The text to inspect.
 * @returns `true` if any whitespace-separated word looks like an email address.
 */
function isContainingEmail(text) {
  if (!text.includes('@') || isEmailOnly(text)) return false
  // Don't use splitWords() here because `foo@example.com` will be
  // split up into ['foo', 'example.com'].
  return text.split(/\s+/).some((word) => isEmailOnly(word))
}
|
||
/**
 * Whether the text is a single token that parses as an absolute URL.
 *
 * @param text - The text to inspect.
 * @returns `true` if the trimmed text has no whitespace and `URL.canParse` accepts it.
 */
function isURL(text) {
  const candidate = text.trim()
  // Reject any kind of interior whitespace, not just spaces: the URL parser
  // silently strips tabs and newlines, so it cannot be relied on for this.
  if (/\s/.test(candidate)) return false
  return URL.canParse(candidate)
}
|
||
/**
 * Whether the text consists of digits only, ignoring all whitespace.
 *
 * @param text - The text to inspect.
 * @returns `true` if at least one digit and nothing but digits remain once
 *   whitespace is removed.
 */
function isNumbersOnly(text) {
  return /^\d+$/.test(text.replace(/\s/g, ''))
}
|
||
/**
 * Whether the text is written entirely in uppercase.
 *
 * Requires at least one uppercase letter (in any script, not just A-Z), so
 * text without letters, such as plain numbers, does not count.
 *
 * @param text - The text to inspect.
 * @returns `true` if the text has an uppercase letter and uppercasing it changes nothing.
 */
function isAllUppercase(text) {
  return /\p{Lu}/u.test(text) && text === text.toUpperCase()
}
|
||
/**
 * Whether the text is at most one whitespace-separated word.
 *
 * Empty and whitespace-only text counts as too short. Single-token text
 * that also matches another signal (a lone number, URL, email address or
 * uppercase word) is counted here as well.
 *
 * @param text - The text to inspect.
 * @returns `true` if the trimmed text contains no more than one word.
 */
function isTooShort(text) {
  return text.trim().split(/\s+/).length <= 1
}
|
||
/**
 * Whether the text appears to be written in a language other than the
 * expected one, according to the module-level language guesser.
 *
 * @param text - The text to inspect.
 * @param language_ - The expected language code; compared against the `alpha2`
 *   value of the best guess.
 * @returns `true` if no guess could be made or the best guess differs from `language_`.
 */
function isNotLanguage(text, language_) {
  const bestGuess = language.guessBest(text.trim())
  if (!bestGuess) return true // Can happen if the text is just whitespace
  // @horizon-rs/language-guesser is based on tri-grams and can lead
  // to false positives. For example, it thinks that 'Thamk you ❤️🙏' is
  // Haitian! And that 'I wanne robux 1000' is Polish!
  // But that's because they are short and there's not enough clues to
  // guess what language it is. You and I might know those are actually
  // attempts to be English, despite the spelling.
  // But are they useful comments? Given that this is just a signal,
  // and not a hard blocker, it's more of a clue than a fact.
  return bestGuess.alpha2 !== language_
}
|
||
/**
 * Pick the cuss word dictionary for a language.
 *
 * @param lang - Language code; `'pt'`, `'fr'` and `'es'` have their own dictionary.
 * @returns The dictionary for `lang`, or the default `cuss` one for anything else.
 */
function getCussWords(lang) {
  switch (lang) {
    case 'pt':
      return cussPt
    case 'fr':
      return cussFr
    case 'es':
      return cussEs
    default:
      return cuss
  }
}
|
||
/**
 * Whether any word of the text is listed in the cuss word dictionary with
 * exactly the given rating.
 *
 * @param text - The text to inspect.
 * @param language_ - Language code used to pick the dictionary.
 * @param rating - The dictionary rating a word must have to count (default 2).
 * @returns `true` if at least one word has that rating.
 */
function isLikelyCussWords(text, language_, rating = 2) {
  const cussWords = getCussWords(language_)
  // Look each word up as written and lowercased so that capitalised or
  // shouted words are caught too.
  // NOTE(review): assumes the dictionary keys are lowercase; confirm against `cuss`.
  return splitWords(text).some(
    (word) => cussWords[word] === rating || cussWords[word.toLowerCase()] === rating,
  )
}
|
||
/**
 * Same check as isLikelyCussWords, but for words the dictionary rates 1
 * instead of 2.
 *
 * @param text - The text to inspect.
 * @param language_ - Language code used to pick the dictionary.
 * @returns `true` if at least one word has rating 1.
 */
function isMaybeCussWords(text, language_) {
  return isLikelyCussWords(text, language_, 1)
}
|
||
// Word segmenter using the runtime's default locale. Created once and
// reused for every call to splitWords().
const segmenter = new Intl.Segmenter([], { granularity: 'word' })

/**
 * Split text into its words, dropping punctuation and whitespace segments.
 *
 * @param text - The text to split.
 * @returns The word-like segments, in order of appearance.
 */
function splitWords(text) {
  return Array.from(segmenter.segment(text))
    .filter((part) => part.isWordLike)
    .map((part) => part.segment)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
/** | ||
* This script is used to analyze posted survey comments in a CSV file. | ||
* The CSV file is expected to have come from the Azure Data Explorer | ||
 * after having queried the `docs_v0_survey_event` table. | ||
* | ||
* | ||
*/ | ||
|
||
import fs from 'node:fs' | ||
import util from 'node:util' | ||
|
||
import chalk from 'chalk' | ||
import { parse } from 'csv-parse' | ||
import { program } from 'commander' | ||
|
||
import { SIGNAL_RATINGS } from '../analyze-comment' | ||
|
||
/** Command line options, as handed to the action handler by commander. */
type Options = {
  /** Value of `-o, --output-file` (default `'stdout'`). */
  outputFile: string
  /** Raw value of `--limit`, still a string (default `'Infinity'`). */
  limit: string
  /** Whether `--random` was given (default `false`). */
  random: boolean
}
program
  .description('Analyze survey comments in a CSV file')
  .option('-o, --output-file <path>', 'path to the output', 'stdout')
  .option('--limit <number>', 'limit number of records analyzed', 'Infinity')
  .option(
    '--random',
    'randomize the lines analyzed (useful when limit is less than size of CSV)',
    false,
  )
  .argument('<csv-files...>', 'path to the exported CSV file')
  .action(main)

// The action handler is async, so use parseAsync() to have commander wait
// for it, and report a failure instead of leaving the promise unhandled.
program.parseAsync(process.argv).catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})
|
||
/**
 * Action handler for the command: analyze the given CSV files one at a
 * time, in the order they were given.
 *
 * @param csvFile - Paths of the CSV files to analyze.
 * @param options - The parsed command line options.
 */
async function main(csvFile: string[], options: Options): Promise<void> {
  for (const file of csvFile) {
    await analyzeFile(file, options)
  }
}
|
||
/**
 * One CSV row, keyed by the column names from the header row.
 * (Not called `Record`, to avoid shadowing the built-in utility type.)
 */
type CsvRecord = {
  [key: string]: string
}

/** Turn the raw `--limit` value into an upper bound for `Array.prototype.slice`. */
function parseLimit(raw: string): number {
  // Number() rather than parseInt(): parseInt('Infinity') is NaN and
  // slice(0, NaN) is always empty, so the default limit would analyze nothing.
  const limit = Number(raw)
  if (Number.isNaN(limit) || limit < 0) {
    throw new Error(`Invalid --limit value: ${raw}`)
  }
  return Math.floor(limit)
}

/** Shuffle an array in place with an unbiased Fisher-Yates shuffle. */
function shuffleInPlace<T>(items: T[]): void {
  for (let i = items.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1))
    const swapped = items[i]
    items[i] = items[j]
    items[j] = swapped
  }
}

/**
 * Read a CSV file and print, for each analyzed record, the comment, the
 * signals that hit and the resulting rating.
 *
 * The first row of the CSV is taken as the header row. The comment is read
 * from the `survey_comment` column and its language from
 * `survey_comment_language`, falling back to `'en'`.
 *
 * @param csvFile - Path of the CSV file to analyze.
 * @param options - The parsed command line options; `limit` and `random` are used.
 * @throws Error if `options.limit` is not a non-negative number.
 */
async function analyzeFile(csvFile: string, options: Options): Promise<void> {
  const parser = fs.createReadStream(csvFile).pipe(
    parse({
      // Needed when parsing CSVs from the Azure Data Explorer
      bom: true,
    }),
  )
  let headers: string[] | null = null
  const records: CsvRecord[] = []
  for await (const row of parser) {
    if (headers === null) {
      // The first row holds the column names
      headers = row as string[]
      continue
    }
    const obj: CsvRecord = {}
    for (const [index, header] of headers.entries()) {
      obj[header] = row[index]
    }
    records.push(obj)
  }

  const limit = parseLimit(options.limit)
  if (options.random) {
    shuffleInPlace(records)
  }
  // Pad every signal name to the longest one so the reductions line up
  const nameWidth = Math.max(...SIGNAL_RATINGS.map(({ name }) => name.length))
  for (const record of records.slice(0, limit)) {
    const language = record.survey_comment_language || 'en'
    let rating = 1.0
    let first = true
    for (const { reduction, name, validator } of SIGNAL_RATINGS) {
      if (!validator(record.survey_comment, language)) continue
      rating -= reduction
      if (first) {
        // Print the comment once, before its first signal
        console.log(util.inspect(record.survey_comment))
        first = false
      }
      console.log(name.padEnd(nameWidth), reduction)
      if (rating <= 0.0) {
        break
      }
    }
    if (rating !== 1.0) {
      console.log(chalk.yellow(`Rating: ${rating}`))
    } else {
      console.log(chalk.green('No rating reduction'))
    }

    console.log('\n')
  }
}
Oops, something went wrong.