mirror of
https://github.com/koloml/philomena-tagging-assistant.git
synced 2026-06-23 18:22:20 +00:00
Merge pull request #172 from koloml/bugfix/tags-in-forums-with-encoded-text
Fixed wrong parsing of tag links inside forum posts
This commit is contained in:
@@ -41,21 +41,27 @@ export class QuotedTermToken extends Token {
|
||||
}
|
||||
|
||||
static decode(value: string): string {
|
||||
return value.replace(/\\([\\"])/g, "$1");
|
||||
return value
|
||||
.replaceAll(/\\([\\"])/g, "$1")
|
||||
.replaceAll(/^"|"$/g, '');
|
||||
}
|
||||
|
||||
static encode(value: string): string {
|
||||
return value.replace(/[\\"]/g, "\\$&");
|
||||
return `"${value.replaceAll(/[\\"]/g, "\\$&")}"`;
|
||||
}
|
||||
}
|
||||
|
||||
export class TermToken extends Token {
|
||||
}
|
||||
|
||||
type MatchResultCarry = {
|
||||
interface MatchResultCarry {
|
||||
match?: RegExpMatchArray | null
|
||||
}
|
||||
|
||||
interface SuccessfulMatchResultCarry {
|
||||
match: RegExpMatchArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Search query tokenizer. Should mostly work for the cases of parsing and finding the selected term for
|
||||
* auto-completion. Follows the rules described in the Philomena booru engine.
|
||||
@@ -94,26 +100,26 @@ export class QueryLexer {
|
||||
}
|
||||
|
||||
if (this.#match(QueryLexer.#negotiationOperator, result)) {
|
||||
tokens.push(new NotToken(this.#index, result.match![0]));
|
||||
this.#index += result.match![0].length;
|
||||
tokens.push(new NotToken(this.#index, result.match[0]));
|
||||
this.#index += result.match[0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (this.#match(QueryLexer.#andOperator, result)) {
|
||||
tokens.push(new AndToken(this.#index, result.match![0]));
|
||||
this.#index += result.match![0].length;
|
||||
tokens.push(new AndToken(this.#index, result.match[0]));
|
||||
this.#index += result.match[0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (this.#match(QueryLexer.#orOperator, result)) {
|
||||
tokens.push(new OrToken(this.#index, result.match![0]));
|
||||
this.#index += result.match![0].length;
|
||||
tokens.push(new OrToken(this.#index, result.match[0]));
|
||||
this.#index += result.match[0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (this.#match(QueryLexer.#notOperator, result)) {
|
||||
tokens.push(new NotToken(this.#index, result.match![0]));
|
||||
this.#index += result.match![0].length;
|
||||
tokens.push(new NotToken(this.#index, result.match[0]));
|
||||
this.#index += result.match[0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -130,26 +136,26 @@ export class QueryLexer {
|
||||
}
|
||||
|
||||
if (this.#match(QueryLexer.#boostOperator, result)) {
|
||||
tokens.push(new BoostToken(this.#index, result.match![0]));
|
||||
this.#index += result.match![0].length;
|
||||
tokens.push(new BoostToken(this.#index, result.match[0]));
|
||||
this.#index += result.match[0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (this.#match(QueryLexer.#whitespaces, result)) {
|
||||
this.#index += result.match![0].length;
|
||||
this.#index += result.match[0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (this.#match(QueryLexer.#quotedText, result)) {
|
||||
tokens.push(new QuotedTermToken(this.#index, result.match![0], result.match![1]));
|
||||
this.#index += result.match![0].length;
|
||||
tokens.push(new QuotedTermToken(this.#index, result.match[0], result.match[1]));
|
||||
this.#index += result.match[0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
dirtyText = this.#parseDirtyText(this.#index);
|
||||
|
||||
if (dirtyText) {
|
||||
tokens.push(new TermToken(this.#index, dirtyText));
|
||||
tokens.push(new TermToken(this.#index, dirtyText.trim()));
|
||||
this.#index += dirtyText.length;
|
||||
continue;
|
||||
}
|
||||
@@ -168,7 +174,7 @@ export class QueryLexer {
|
||||
*
|
||||
* @return Is there a match?
|
||||
*/
|
||||
#match(targetRegExp: RegExp, resultCarrier: MatchResultCarry = {}): boolean {
|
||||
#match(targetRegExp: RegExp, resultCarrier: MatchResultCarry = {}): resultCarrier is SuccessfulMatchResultCarry {
|
||||
return this.#matchAt(targetRegExp, this.#index, resultCarrier);
|
||||
}
|
||||
|
||||
@@ -181,9 +187,9 @@ export class QueryLexer {
|
||||
*
|
||||
* @return Is there a match?
|
||||
*/
|
||||
#matchAt(targetRegExp: RegExp, index: number, resultCarrier: MatchResultCarry = {}): boolean {
|
||||
#matchAt(targetRegExp: RegExp, index: number, resultCarrier: MatchResultCarry = {}): resultCarrier is SuccessfulMatchResultCarry {
|
||||
targetRegExp.lastIndex = index;
|
||||
resultCarrier.match = this.#value.match(targetRegExp);
|
||||
resultCarrier.match = targetRegExp.exec(this.#value);
|
||||
|
||||
return resultCarrier.match !== null;
|
||||
}
|
||||
@@ -207,16 +213,10 @@ export class QueryLexer {
|
||||
break;
|
||||
}
|
||||
|
||||
if (this.#matchAt(QueryLexer.#dirtyTextContent, index, result)) {
|
||||
resultValue += result.match![0];
|
||||
index += result.match![0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (this.#value[index] === QueryLexer.#bracketsOpenCharacter) {
|
||||
let bracketsContent = QueryLexer.#bracketsOpenCharacter + this.#parseDirtyText(index + 1);
|
||||
|
||||
if (this.#value[index + bracketsContent.length + 1] === QueryLexer.#bracketsCloseCharacter) {
|
||||
if (this.#value[index + bracketsContent.length] === QueryLexer.#bracketsCloseCharacter) {
|
||||
bracketsContent += QueryLexer.#bracketsCloseCharacter;
|
||||
}
|
||||
|
||||
@@ -227,22 +227,28 @@ export class QueryLexer {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (this.#matchAt(QueryLexer.#dirtyTextContent, index, result)) {
|
||||
resultValue += result.match[0];
|
||||
index += result.match[0].length;
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
return resultValue;
|
||||
}
|
||||
|
||||
static #commaCharacter = ',';
|
||||
static #negotiationOperator = /[!-]/y;
|
||||
static #andOperator = /\s+(?:AND|&&)\s+/y;
|
||||
static #orOperator = /\s+(?:OR|\|\|)\s+/y;
|
||||
static #notOperator = /NOT\s+/y;
|
||||
static #bracketsOpenCharacter = "(";
|
||||
static #bracketsCloseCharacter = ")";
|
||||
static #boostOperator = /\^[+-]?\d+(?:\.\d+)?/y;
|
||||
static #whitespaces = /\s+/y;
|
||||
static #quotedText = /"((?:\\.|[^\\"])+)"/y;
|
||||
static #dirtyTextStopWords = /,|\s+(?:AND|&&|OR|\|\|)\s+|\s+(?:\)|\^[+-]?\d+(?:\.\d+)?)/y;
|
||||
static #dirtyTextContent = /\\.|[^()]/y;
|
||||
static readonly #commaCharacter = ',';
|
||||
static readonly #negotiationOperator = /[!-]/y;
|
||||
static readonly #andOperator = /\s+(?:AND|&&)\s+/y;
|
||||
static readonly #orOperator = /\s+(?:OR|\|\|)\s+/y;
|
||||
static readonly #notOperator = /NOT\s+/y;
|
||||
static readonly #bracketsOpenCharacter = "(";
|
||||
static readonly #bracketsCloseCharacter = ")";
|
||||
static readonly #boostOperator = /\^[+-]?\d+(?:\.\d+)?/y;
|
||||
static readonly #whitespaces = /\s+/y;
|
||||
static readonly #quotedText = /"\s*((?:\\.|[^\\"])+?)\s*"/y;
|
||||
static readonly #dirtyTextStopWords = /,|\s+(?:AND|&&|OR|\|\|)\s+|\s*(?:\)|\^[+-]?\d+(?:\.\d+)?)/y;
|
||||
static readonly #dirtyTextContent = /\\.|[^()]/y;
|
||||
}
|
||||
|
||||
@@ -42,7 +42,7 @@ const tagLinkRegExp = /\/tags\/(?<encodedTagName>[^/?#]+)/;
|
||||
*
|
||||
* @see https://github.com/philomena-dev/philomena/blob/6086757b654da8792ae52adb2a2f501ea6c30d12/lib/philomena/slug.ex#L52-L57
|
||||
*/
|
||||
const slugEncodedCharacters: Map<string, string> = new Map([
|
||||
export const slugEncodedCharacters: Map<string, string> = new Map([
|
||||
['-dash-', '-'],
|
||||
['-fwslash-', '/'],
|
||||
['-bwslash-', '\\'],
|
||||
@@ -101,9 +101,8 @@ export function resolveTagNameFromLink(tagLink: URL): string | null {
|
||||
}
|
||||
|
||||
return decodeURIComponent(encodedTagName)
|
||||
.replaceAll(/-[a-z]+-/gi, match => slugEncodedCharacters.get(match) ?? match)
|
||||
.replaceAll('-', ' ')
|
||||
.replaceAll('+', ' ');
|
||||
.replaceAll('+', ' ')
|
||||
.replaceAll(/-[a-z]+-/gi, match => slugEncodedCharacters.get(match) ?? match);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
111
tests/lib/philomena/search/QueryLexer.spec.ts
Normal file
111
tests/lib/philomena/search/QueryLexer.spec.ts
Normal file
@@ -0,0 +1,111 @@
|
||||
import {
|
||||
AndToken,
|
||||
BoostToken,
|
||||
GroupEndToken,
|
||||
GroupStartToken,
|
||||
NotToken,
|
||||
OrToken,
|
||||
QueryLexer,
|
||||
QuotedTermToken,
|
||||
TermToken,
|
||||
Token
|
||||
} from "$lib/philomena/search/QueryLexer";
|
||||
|
||||
describe('QueryLexer', () => {
|
||||
function parseQuery(query: string): Token[] {
|
||||
return new QueryLexer(query).parse();
|
||||
}
|
||||
|
||||
function parseQueryTypes(query: string): (typeof Token)[] {
|
||||
return parseQuery(query)
|
||||
.map(term => (term.constructor as any) as typeof Token);
|
||||
}
|
||||
|
||||
it('should properly parse different kinds of queries', () => {
|
||||
expect(parseQueryTypes('safe')).toEqual([TermToken]);
|
||||
expect(parseQueryTypes('safe^1')).toEqual([TermToken, BoostToken]);
|
||||
expect(parseQueryTypes('safe, avali')).toEqual([TermToken, AndToken, TermToken]);
|
||||
expect(parseQueryTypes('!avali')).toEqual([NotToken, TermToken]);
|
||||
expect(parseQueryTypes('avali || 4 ears')).toEqual([TermToken, OrToken, TermToken]);
|
||||
expect(parseQueryTypes('avali && !4 ears')).toEqual([TermToken, AndToken, NotToken, TermToken]);
|
||||
|
||||
expect(parseQueryTypes('avali AND (NOT 4 ears OR -3 fingers)')).toEqual([
|
||||
TermToken, AndToken, GroupStartToken, NotToken, TermToken, OrToken, NotToken, TermToken, GroupEndToken,
|
||||
]);
|
||||
});
|
||||
|
||||
it('should not treat parentheses as groups inside the term', () => {
|
||||
expect(parseQueryTypes('!(experiment (casualties unknown) || milky (casualties unknown))')).toEqual([
|
||||
NotToken, GroupStartToken, TermToken, OrToken, TermToken, GroupEndToken,
|
||||
]);
|
||||
});
|
||||
|
||||
it('should accept any amount of whitespaces between different tokens', () => {
|
||||
expect(parseQueryTypes('! ( avali , experiment (casualties unknown) ) && safe')).toEqual([
|
||||
NotToken, GroupStartToken, TermToken, AndToken, TermToken, GroupEndToken, AndToken, TermToken,
|
||||
]);
|
||||
});
|
||||
|
||||
it('should trim whitespaces inside the terms, even in quoted ones', () => {
|
||||
const [termWithSpaces] = parseQuery(' avali ');
|
||||
expect(termWithSpaces.value).toBe('avali');
|
||||
|
||||
const [quotedTermWithSpaces] = parseQuery(' " avali " ');
|
||||
expect(quotedTermWithSpaces instanceof QuotedTermToken && quotedTermWithSpaces.decodedValue || new Error('Wrong token')).toBe('avali');
|
||||
});
|
||||
|
||||
it('should properly differentiate between word-like operators and parts of tags', () => {
|
||||
expect(parseQueryTypes('safe AND sound')).toEqual([TermToken, AndToken, TermToken]);
|
||||
expect(parseQueryTypes('NOT safe AND dangerous')).toEqual([NotToken, TermToken, AndToken, TermToken]);
|
||||
});
|
||||
|
||||
it('should only detect word-like operators when spaces are in place', () => {
|
||||
// Require whitespace between operator and other tokens
|
||||
expect(parseQueryTypes('NOT safeANDsound')).toEqual([NotToken, TermToken]);
|
||||
|
||||
// If there is no whitespace, it should just be treated as a part of a term
|
||||
expect(parseQuery('safeAND sound')[0].value).toEqual('safeAND sound');
|
||||
|
||||
// All operators should be in all caps, otherwise it's just a term
|
||||
const [lowercaseOperatorWords] = parseQuery('avali are cute and you know it or else');
|
||||
expect(lowercaseOperatorWords.value).toBe('avali are cute and you know it or else');
|
||||
|
||||
// And if it is in caps, but is part of some word, then it's just a word
|
||||
const [wordsInCapsContainingOperators] = parseQuery('THAT POOR KNOT IS PLAIN AS SAND');
|
||||
expect(wordsInCapsContainingOperators.value).toBe('THAT POOR KNOT IS PLAIN AS SAND');
|
||||
});
|
||||
|
||||
it('should not treat any operators inside the quoted term as actual operators', () => {
|
||||
const tokens = parseQuery('"this AND that OR these NOT there || () && ^123"');
|
||||
const [quotedTermToken] = tokens;
|
||||
|
||||
expect(tokens).toHaveLength(1);
|
||||
|
||||
expect(quotedTermToken instanceof QuotedTermToken && quotedTermToken.decodedValue || null)
|
||||
.toBe('this AND that OR these NOT there || () && ^123');
|
||||
});
|
||||
|
||||
describe('QuotedTermToken', () => {
|
||||
it('should decode and encode quotes and backslash', () => {
|
||||
const encodedQuote = `"term with \\\" inside of it"`;
|
||||
const decodedQuote = 'term with " inside of it';
|
||||
|
||||
expect(QuotedTermToken.decode(encodedQuote)).toBe(decodedQuote);
|
||||
expect(QuotedTermToken.encode(decodedQuote)).toBe(encodedQuote);
|
||||
|
||||
const encodedBackslash = `"term with \\\\ inside of it"`;
|
||||
const decodedBackslash = 'term with \\ inside of it';
|
||||
|
||||
expect(QuotedTermToken.decode(encodedBackslash)).toBe(decodedBackslash);
|
||||
expect(QuotedTermToken.encode(decodedBackslash)).toBe(encodedBackslash);
|
||||
});
|
||||
|
||||
it('should not care for anything else', () => {
|
||||
const encodedTerm = '"operators: , && || AND OR NOT ! ^ ? *"';
|
||||
const decodedTerm = 'operators: , && || AND OR NOT ! ^ ? *';
|
||||
|
||||
expect(QuotedTermToken.decode(encodedTerm)).toBe(decodedTerm);
|
||||
expect(QuotedTermToken.encode(decodedTerm)).toBe(encodedTerm);
|
||||
});
|
||||
});
|
||||
});
|
||||
76
tests/lib/philomena/tag-utils.spec.ts
Normal file
76
tests/lib/philomena/tag-utils.spec.ts
Normal file
@@ -0,0 +1,76 @@
|
||||
import { URL } from 'url';
|
||||
import { resolveTagNameFromLink, slugEncodedCharacters } from '$lib/philomena/tag-utils';
|
||||
|
||||
describe('tag-utils', () => {
|
||||
const origin = 'https://furbooru.org';
|
||||
|
||||
describe('resolveTagNameFromLink', () => {
|
||||
function resolveFromSearchQuery(encodedQuery: string): string | null {
|
||||
return resolveTagNameFromLink(new URL(`/search?q=${encodedQuery}`, origin));
|
||||
}
|
||||
|
||||
describe('Parsing from /search/?q=tag links', () => {
|
||||
it('should resolve a single tag from /search URLs', () => {
|
||||
expect(resolveFromSearchQuery('safe')).toBe('safe');
|
||||
});
|
||||
|
||||
it('should return null for queries with multiple comma-separated tags', () => {
|
||||
// Comma acts as a separator in the query, resulting in multiple tokens
|
||||
expect(resolveFromSearchQuery('safe, suggestive')).toBe(null);
|
||||
});
|
||||
|
||||
it('should return null if query is empty or not a term', () => {
|
||||
expect(resolveFromSearchQuery('')).toBe(null);
|
||||
expect(resolveFromSearchQuery('!')).toBe(null);
|
||||
});
|
||||
|
||||
it('should properly treat parentheses in the query with single tag', () => {
|
||||
// Parentheses are operators in the query language, but when inside the tag name, they should still be properly
|
||||
// working.
|
||||
expect(resolveFromSearchQuery('experiment (casualties unknown)')).toBe('experiment (casualties unknown)');
|
||||
});
|
||||
|
||||
it('should properly resolve queries with encoded characters', () => {
|
||||
expect(resolveFromSearchQuery('pok%C3%A9mon')).toBe('pokémon');
|
||||
});
|
||||
|
||||
it('should unquote quoted term', () => {
|
||||
expect(resolveFromSearchQuery('"experiment (casualties unknown)"')).toBe('experiment (casualties unknown)')
|
||||
expect(resolveFromSearchQuery('"single tag, really"')).toBe('single tag, really');
|
||||
});
|
||||
})
|
||||
|
||||
describe('Parsing from /tags/name links', () => {
|
||||
function resolveFromTagLink(encodedTagName: string): string | null {
|
||||
return resolveTagNameFromLink(new URL(`/tags/${encodedTagName}`, origin));
|
||||
}
|
||||
|
||||
it('should resolve a single tag', () => {
|
||||
expect(resolveFromTagLink('safe')).toBe('safe');
|
||||
});
|
||||
|
||||
it('should only read the tag page even if query is provided', () => {
|
||||
expect(resolveFromTagLink('grotesque?q=explicit')).toBe('grotesque');
|
||||
});
|
||||
|
||||
it('should properly resolve links with encoded characters', () => {
|
||||
expect(resolveFromTagLink('pok%C3%A9mon')).toBe('pokémon');
|
||||
});
|
||||
|
||||
it('should decoded slug-encoded characters', () => {
|
||||
// A more common example: a namespaced tag, where the colon is slug-encoded and the space is encoded as a plus sign.
|
||||
expect(resolveFromTagLink(`namespace-colon-tag+name`)).toBe('namespace:tag name');
|
||||
|
||||
// Testing the whole list of encoded characters.
|
||||
for (const [encodedCharacter, decodedCharacter] of slugEncodedCharacters.entries()) {
|
||||
expect(resolveFromTagLink(`test+symbol${encodedCharacter}without+spaces`)).toBe(`test symbol${decodedCharacter}without spaces`);
|
||||
expect(resolveFromTagLink(`test+symbol+${encodedCharacter}+with+spaces`)).toBe(`test symbol ${decodedCharacter} with spaces`);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
it('should return null for unsupported URLs', () => {
|
||||
expect(resolveTagNameFromLink(new URL('/pages/example', origin))).toBe(null);
|
||||
});
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user