1
0
mirror of https://github.com/koloml/philomena-tagging-assistant.git synced 2026-06-23 18:22:20 +00:00

Merge pull request #172 from koloml/bugfix/tags-in-forums-with-encoded-text

Fixed wrong parsing of tag links inside forum posts
This commit is contained in:
2026-06-13 19:55:32 -04:00
committed by GitHub
4 changed files with 235 additions and 43 deletions

View File

@@ -41,21 +41,27 @@ export class QuotedTermToken extends Token {
}
/**
 * Convert the quoted form of a term back into its plain text.
 *
 * Removes one wrapping double quote from each end (when present) and unescapes `\"` and `\\`.
 * Everything happens in a single left-to-right pass, so a quote that is part of an escape
 * sequence is always consumed as an escape and never mistaken for the closing quote. This keeps
 * values like `abc\"` intact (`abc"`) instead of dropping their last character.
 *
 * @param value Quoted term, with or without the surrounding double quotes.
 *
 * @return Term text without wrapping quotes and with escape sequences resolved.
 */
static decode(value: string): string {
  // Alternatives are tried in order at every position: opening quote, escape sequence, closing quote.
  // Only the escape alternative has a capture group, so `escaped` is undefined for the wrapping quotes.
  return value.replace(
    /^"|\\([\\"])|"$/g,
    (_sequence: string, escaped?: string) => escaped ?? ''
  );
}
/**
 * Convert plain term text into its quoted form.
 *
 * Every backslash and double quote is prefixed with a backslash, then the result is wrapped in
 * double quotes.
 *
 * @param value Plain term text.
 *
 * @return Escaped text surrounded by double quotes.
 */
static encode(value: string): string {
  // "\\$&" is a backslash followed by the matched character itself.
  const escapedValue = value.replace(/[\\"]/g, "\\$&");

  return `"${escapedValue}"`;
}
}
/**
 * Token for a plain (unquoted) term. The lexer creates it from the trimmed "dirty" text it collects
 * when none of the patterns it tries before that point match.
 */
export class TermToken extends Token {
}
type MatchResultCarry = {
interface MatchResultCarry {
match?: RegExpMatchArray | null
}
interface SuccessfulMatchResultCarry {
match: RegExpMatchArray;
}
/**
* Search query tokenizer. Should mostly work for the cases of parsing and finding the selected term for
* auto-completion. Follows the rules described in the Philomena booru engine.
@@ -94,26 +100,26 @@ export class QueryLexer {
}
if (this.#match(QueryLexer.#negotiationOperator, result)) {
tokens.push(new NotToken(this.#index, result.match![0]));
this.#index += result.match![0].length;
tokens.push(new NotToken(this.#index, result.match[0]));
this.#index += result.match[0].length;
continue;
}
if (this.#match(QueryLexer.#andOperator, result)) {
tokens.push(new AndToken(this.#index, result.match![0]));
this.#index += result.match![0].length;
tokens.push(new AndToken(this.#index, result.match[0]));
this.#index += result.match[0].length;
continue;
}
if (this.#match(QueryLexer.#orOperator, result)) {
tokens.push(new OrToken(this.#index, result.match![0]));
this.#index += result.match![0].length;
tokens.push(new OrToken(this.#index, result.match[0]));
this.#index += result.match[0].length;
continue;
}
if (this.#match(QueryLexer.#notOperator, result)) {
tokens.push(new NotToken(this.#index, result.match![0]));
this.#index += result.match![0].length;
tokens.push(new NotToken(this.#index, result.match[0]));
this.#index += result.match[0].length;
continue;
}
@@ -130,26 +136,26 @@ export class QueryLexer {
}
if (this.#match(QueryLexer.#boostOperator, result)) {
tokens.push(new BoostToken(this.#index, result.match![0]));
this.#index += result.match![0].length;
tokens.push(new BoostToken(this.#index, result.match[0]));
this.#index += result.match[0].length;
continue;
}
if (this.#match(QueryLexer.#whitespaces, result)) {
this.#index += result.match![0].length;
this.#index += result.match[0].length;
continue;
}
if (this.#match(QueryLexer.#quotedText, result)) {
tokens.push(new QuotedTermToken(this.#index, result.match![0], result.match![1]));
this.#index += result.match![0].length;
tokens.push(new QuotedTermToken(this.#index, result.match[0], result.match[1]));
this.#index += result.match[0].length;
continue;
}
dirtyText = this.#parseDirtyText(this.#index);
if (dirtyText) {
tokens.push(new TermToken(this.#index, dirtyText));
tokens.push(new TermToken(this.#index, dirtyText.trim()));
this.#index += dirtyText.length;
continue;
}
@@ -168,7 +174,7 @@ export class QueryLexer {
*
* @return Is there a match?
*/
#match(targetRegExp: RegExp, resultCarrier: MatchResultCarry = {}): boolean {
#match(targetRegExp: RegExp, resultCarrier: MatchResultCarry = {}): resultCarrier is SuccessfulMatchResultCarry {
return this.#matchAt(targetRegExp, this.#index, resultCarrier);
}
@@ -181,9 +187,9 @@ export class QueryLexer {
*
* @return Is there a match?
*/
#matchAt(targetRegExp: RegExp, index: number, resultCarrier: MatchResultCarry = {}): boolean {
#matchAt(targetRegExp: RegExp, index: number, resultCarrier: MatchResultCarry = {}): resultCarrier is SuccessfulMatchResultCarry {
targetRegExp.lastIndex = index;
resultCarrier.match = this.#value.match(targetRegExp);
resultCarrier.match = targetRegExp.exec(this.#value);
return resultCarrier.match !== null;
}
@@ -207,16 +213,10 @@ export class QueryLexer {
break;
}
if (this.#matchAt(QueryLexer.#dirtyTextContent, index, result)) {
resultValue += result.match![0];
index += result.match![0].length;
continue;
}
if (this.#value[index] === QueryLexer.#bracketsOpenCharacter) {
let bracketsContent = QueryLexer.#bracketsOpenCharacter + this.#parseDirtyText(index + 1);
if (this.#value[index + bracketsContent.length + 1] === QueryLexer.#bracketsCloseCharacter) {
if (this.#value[index + bracketsContent.length] === QueryLexer.#bracketsCloseCharacter) {
bracketsContent += QueryLexer.#bracketsCloseCharacter;
}
@@ -227,22 +227,28 @@ export class QueryLexer {
continue;
}
if (this.#matchAt(QueryLexer.#dirtyTextContent, index, result)) {
resultValue += result.match[0];
index += result.match[0].length;
continue;
}
break;
}
return resultValue;
}
static #commaCharacter = ',';
static #negotiationOperator = /[!-]/y;
static #andOperator = /\s+(?:AND|&&)\s+/y;
static #orOperator = /\s+(?:OR|\|\|)\s+/y;
static #notOperator = /NOT\s+/y;
static #bracketsOpenCharacter = "(";
static #bracketsCloseCharacter = ")";
static #boostOperator = /\^[+-]?\d+(?:\.\d+)?/y;
static #whitespaces = /\s+/y;
static #quotedText = /"((?:\\.|[^\\"])+)"/y;
static #dirtyTextStopWords = /,|\s+(?:AND|&&|OR|\|\|)\s+|\s+(?:\)|\^[+-]?\d+(?:\.\d+)?)/y;
static #dirtyTextContent = /\\.|[^()]/y;
static readonly #commaCharacter = ',';
static readonly #negotiationOperator = /[!-]/y;
static readonly #andOperator = /\s+(?:AND|&&)\s+/y;
static readonly #orOperator = /\s+(?:OR|\|\|)\s+/y;
static readonly #notOperator = /NOT\s+/y;
static readonly #bracketsOpenCharacter = "(";
static readonly #bracketsCloseCharacter = ")";
static readonly #boostOperator = /\^[+-]?\d+(?:\.\d+)?/y;
static readonly #whitespaces = /\s+/y;
static readonly #quotedText = /"\s*((?:\\.|[^\\"])+?)\s*"/y;
static readonly #dirtyTextStopWords = /,|\s+(?:AND|&&|OR|\|\|)\s+|\s*(?:\)|\^[+-]?\d+(?:\.\d+)?)/y;
static readonly #dirtyTextContent = /\\.|[^()]/y;
}

View File

@@ -42,7 +42,7 @@ const tagLinkRegExp = /\/tags\/(?<encodedTagName>[^/?#]+)/;
*
* @see https://github.com/philomena-dev/philomena/blob/6086757b654da8792ae52adb2a2f501ea6c30d12/lib/philomena/slug.ex#L52-L57
*/
const slugEncodedCharacters: Map<string, string> = new Map([
export const slugEncodedCharacters: Map<string, string> = new Map([
['-dash-', '-'],
['-fwslash-', '/'],
['-bwslash-', '\\'],
@@ -101,9 +101,8 @@ export function resolveTagNameFromLink(tagLink: URL): string | null {
}
return decodeURIComponent(encodedTagName)
.replaceAll(/-[a-z]+-/gi, match => slugEncodedCharacters.get(match) ?? match)
.replaceAll('-', ' ')
.replaceAll('+', ' ');
.replaceAll('+', ' ')
.replaceAll(/-[a-z]+-/gi, match => slugEncodedCharacters.get(match) ?? match);
}
/**

View File

@@ -0,0 +1,111 @@
import {
AndToken,
BoostToken,
GroupEndToken,
GroupStartToken,
NotToken,
OrToken,
QueryLexer,
QuotedTermToken,
TermToken,
Token
} from "$lib/philomena/search/QueryLexer";
describe('QueryLexer', () => {
  /** Tokenize the query using a fresh lexer instance. */
  function parseQuery(query: string): Token[] {
    return new QueryLexer(query).parse();
  }

  /** Tokenize the query and keep only the class of every token, so sequences are easy to compare. */
  function parseQueryTypes(query: string): (typeof Token)[] {
    return parseQuery(query).map(token => token.constructor as typeof Token);
  }

  /**
   * Narrow the token to a quoted term or fail the test right away. Throwing here gives a readable
   * failure instead of comparing a fallback value against the expected string.
   */
  function asQuotedTerm(token: Token | undefined): QuotedTermToken {
    if (!(token instanceof QuotedTermToken)) {
      throw new Error(`Wrong token: expected QuotedTermToken, got ${token?.constructor.name ?? 'nothing'}`);
    }

    return token;
  }

  it('should properly parse different kinds of queries', () => {
    expect(parseQueryTypes('safe')).toEqual([TermToken]);
    expect(parseQueryTypes('safe^1')).toEqual([TermToken, BoostToken]);
    expect(parseQueryTypes('safe, avali')).toEqual([TermToken, AndToken, TermToken]);
    expect(parseQueryTypes('!avali')).toEqual([NotToken, TermToken]);
    expect(parseQueryTypes('avali || 4 ears')).toEqual([TermToken, OrToken, TermToken]);
    expect(parseQueryTypes('avali && !4 ears')).toEqual([TermToken, AndToken, NotToken, TermToken]);
    expect(parseQueryTypes('avali AND (NOT 4 ears OR -3 fingers)')).toEqual([
      TermToken, AndToken, GroupStartToken, NotToken, TermToken, OrToken, NotToken, TermToken, GroupEndToken,
    ]);
  });

  it('should not treat parentheses as groups inside the term', () => {
    expect(parseQueryTypes('!(experiment (casualties unknown) || milky (casualties unknown))')).toEqual([
      NotToken, GroupStartToken, TermToken, OrToken, TermToken, GroupEndToken,
    ]);
  });

  it('should accept any amount of whitespaces between different tokens', () => {
    expect(parseQueryTypes('! ( avali , experiment (casualties unknown) ) && safe')).toEqual([
      NotToken, GroupStartToken, TermToken, AndToken, TermToken, GroupEndToken, AndToken, TermToken,
    ]);
  });

  it('should trim whitespaces inside the terms, even in quoted ones', () => {
    const [termWithSpaces] = parseQuery(' avali ');
    expect(termWithSpaces.value).toBe('avali');

    const [quotedTermWithSpaces] = parseQuery(' " avali " ');
    expect(asQuotedTerm(quotedTermWithSpaces).decodedValue).toBe('avali');
  });

  it('should properly differentiate between word-like operators and parts of tags', () => {
    expect(parseQueryTypes('safe AND sound')).toEqual([TermToken, AndToken, TermToken]);
    expect(parseQueryTypes('NOT safe AND dangerous')).toEqual([NotToken, TermToken, AndToken, TermToken]);
  });

  it('should only detect word-like operators when spaces are in place', () => {
    // Whitespace is required between an operator and the neighbouring tokens.
    expect(parseQueryTypes('NOT safeANDsound')).toEqual([NotToken, TermToken]);

    // Without that whitespace the operator word is just a part of the term.
    expect(parseQuery('safeAND sound')[0].value).toBe('safeAND sound');

    // Operators must be written in all caps, otherwise they are regular words of the term.
    const [lowercaseOperatorWords] = parseQuery('avali are cute and you know it or else');
    expect(lowercaseOperatorWords.value).toBe('avali are cute and you know it or else');

    // Capital letters that only happen to be a part of a longer word are not operators either.
    const [wordsInCapsContainingOperators] = parseQuery('THAT POOR KNOT IS PLAIN AS SAND');
    expect(wordsInCapsContainingOperators.value).toBe('THAT POOR KNOT IS PLAIN AS SAND');
  });

  it('should not treat any operators inside the quoted term as actual operators', () => {
    const tokens = parseQuery('"this AND that OR these NOT there || () && ^123"');

    expect(tokens).toHaveLength(1);
    expect(asQuotedTerm(tokens[0]).decodedValue)
      .toBe('this AND that OR these NOT there || () && ^123');
  });

  describe('QuotedTermToken', () => {
    it('should decode and encode quotes and backslash', () => {
      const encodedQuote = `"term with \\\" inside of it"`;
      const decodedQuote = 'term with " inside of it';

      expect(QuotedTermToken.decode(encodedQuote)).toBe(decodedQuote);
      expect(QuotedTermToken.encode(decodedQuote)).toBe(encodedQuote);

      const encodedBackslash = `"term with \\\\ inside of it"`;
      const decodedBackslash = 'term with \\ inside of it';

      expect(QuotedTermToken.decode(encodedBackslash)).toBe(decodedBackslash);
      expect(QuotedTermToken.encode(decodedBackslash)).toBe(encodedBackslash);
    });

    it('should not care for anything else', () => {
      const encodedTerm = '"operators: , && || AND OR NOT ! ^ ? *"';
      const decodedTerm = 'operators: , && || AND OR NOT ! ^ ? *';

      expect(QuotedTermToken.decode(encodedTerm)).toBe(decodedTerm);
      expect(QuotedTermToken.encode(decodedTerm)).toBe(encodedTerm);
    });
  });
});

View File

@@ -0,0 +1,76 @@
import { URL } from 'url';
import { resolveTagNameFromLink, slugEncodedCharacters } from '$lib/philomena/tag-utils';
describe('tag-utils', () => {
  // Any origin works here; only the path and the query of the link are under test.
  const origin = 'https://furbooru.org';

  describe('resolveTagNameFromLink', () => {
    describe('Parsing from /search/?q=tag links', () => {
      /** Resolve the tag name from a search link carrying the given (already encoded) query. */
      function resolveFromSearchQuery(encodedQuery: string): string | null {
        return resolveTagNameFromLink(new URL(`/search?q=${encodedQuery}`, origin));
      }

      it('should resolve a single tag from /search URLs', () => {
        expect(resolveFromSearchQuery('safe')).toBe('safe');
      });

      it('should return null for queries with multiple comma-separated tags', () => {
        // Comma acts as a separator in the query, resulting in multiple tokens.
        expect(resolveFromSearchQuery('safe, suggestive')).toBeNull();
      });

      it('should return null if query is empty or not a term', () => {
        expect(resolveFromSearchQuery('')).toBeNull();
        expect(resolveFromSearchQuery('!')).toBeNull();
      });

      it('should properly treat parentheses in the query with single tag', () => {
        // Parentheses are operators in the query language, but inside a tag name they must be kept as a part of
        // that name.
        expect(resolveFromSearchQuery('experiment (casualties unknown)')).toBe('experiment (casualties unknown)');
      });

      it('should properly resolve queries with encoded characters', () => {
        expect(resolveFromSearchQuery('pok%C3%A9mon')).toBe('pokémon');
      });

      it('should unquote quoted term', () => {
        expect(resolveFromSearchQuery('"experiment (casualties unknown)"')).toBe('experiment (casualties unknown)');
        expect(resolveFromSearchQuery('"single tag, really"')).toBe('single tag, really');
      });
    });

    describe('Parsing from /tags/name links', () => {
      /** Resolve the tag name from a tag page link with the given (already encoded) slug. */
      function resolveFromTagLink(encodedTagName: string): string | null {
        return resolveTagNameFromLink(new URL(`/tags/${encodedTagName}`, origin));
      }

      it('should resolve a single tag', () => {
        expect(resolveFromTagLink('safe')).toBe('safe');
      });

      it('should only read the tag page even if query is provided', () => {
        expect(resolveFromTagLink('grotesque?q=explicit')).toBe('grotesque');
      });

      it('should properly resolve links with encoded characters', () => {
        expect(resolveFromTagLink('pok%C3%A9mon')).toBe('pokémon');
      });

      it('should decode slug-encoded characters', () => {
        // Common real-world shape: a namespaced tag with a space in its name.
        expect(resolveFromTagLink('namespace-colon-tag+name')).toBe('namespace:tag name');

        // Every known slug sequence, both glued to the neighbouring words and separated by spaces.
        for (const [encodedCharacter, decodedCharacter] of slugEncodedCharacters) {
          expect(resolveFromTagLink(`test+symbol${encodedCharacter}without+spaces`))
            .toBe(`test symbol${decodedCharacter}without spaces`);
          expect(resolveFromTagLink(`test+symbol+${encodedCharacter}+with+spaces`))
            .toBe(`test symbol ${decodedCharacter} with spaces`);
        }
      });
    });

    it('should return null for unsupported URLs', () => {
      expect(resolveTagNameFromLink(new URL('/pages/example', origin))).toBeNull();
    });
  });
});