From 0431079e517b2fa6292ce289e3a6e25a6493a855 Mon Sep 17 00:00:00 2001 From: SavagePeanut Date: Sun, 3 Sep 2023 15:10:36 -0500 Subject: start telegram support --- src/lib.rs | 108 ++++++++++++++++-- tests/test_style_parser.py | 276 --------------------------------------------- 2 files changed, 101 insertions(+), 283 deletions(-) delete mode 100644 tests/test_style_parser.py diff --git a/src/lib.rs b/src/lib.rs index 8de599a..d868dda 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::cmp::Ordering; use pyo3::prelude::*; @@ -6,6 +7,14 @@ const KEYWORDS: [char; 4] = ['*', '_', '~', '`']; const NO_SUB_PARSING_KEYWORDS: [char; 1] = ['`']; const QUOTE_KEYWORDS: [char; 1] = ['>']; const PLACEHOLDER: &str = "\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}"; +const TELEGRAM_STYLES: &[(&'static str, &'static str)] = &[ + ("_", "italics"), + ("*", "bold"), + ("~", "strikethrough"), + ("||", "spoiler"), + ("`", "code"), + ("```", "pre") +]; #[pyfunction] fn format_body(body: String, new_tags: HashMap) -> PyResult { @@ -16,9 +25,8 @@ fn format_body(body: String, new_tags: HashMap) -> PyR let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0); let parse_quotes = new_tags.contains_key(&">".to_string()); - let mut tags: Vec<(usize, String, usize)> = vec![]; - for style in styles { - let (keyword, start, remove_start, end, remove_end) = style; + let mut tags: Vec<(usize, String, usize)> = Vec::with_capacity(styles.len() * 2); + for (keyword, start, remove_start, end, remove_end) in styles { if new_tags.contains_key(&keyword) { let opening_tag = if keyword == "```language" { new_tags.get(&keyword).unwrap().0.clone() @@ -30,15 +38,14 @@ fn format_body(body: String, new_tags: HashMap) -> PyR }; tags.push((start, opening_tag, remove_start)); tags.push((end, new_tags.get(&keyword).unwrap().1.clone(), remove_end)); - } else if keyword == ">>" && parse_quotes { + } else if (keyword == ">>" && parse_quotes) || keyword == "```>" { tags.push((start, "".to_string(), start+1)); } } tags.sort_by(|a, b| b.0.cmp(&a.0)); - for tag in tags { - let (index, tag, end) = tag; + for (index, tag, end) in tags { chars = [chars[..index].to_vec(), tag.chars().collect(), chars[end..].to_vec()].concat(); } @@ -51,6 +58,92 @@ fn format_body(body: String, new_tags: HashMap) -> PyR Ok(remove_non_escaped_backslashes(text)) } +#[pyfunction] +fn parse_for_telegram(body: String) -> PyResult<(String, Vec<(String, usize, usize, String)>)> { + let mut chars: Vec = body.chars().collect(); + if chars.len() < 1 { + return Ok((body, Vec::with_capacity(0))); + } + + let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0); + let mut remove_tags: Vec<(usize, usize)> = Vec::with_capacity(styles.len() * 2); + for (keyword, start, remove_start, end, remove_end) in &styles { + if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) { + remove_tags.push((*start, *remove_start)); + remove_tags.push((*end, *remove_end)); + } else if keyword == "```>" { + remove_tags.push((*start, *remove_start)); + } + } + + remove_tags.sort_by(|a, b| b.0.cmp(&a.0)); + + for (index, end) in remove_tags { + chars = [chars[..index].to_vec(), chars[end..].to_vec()].concat(); + } + + let mut message_entities: Vec<(bool, usize, String, usize, String)> = Vec::with_capacity(styles.len() * 2); + let mut all_indexes: Vec> = Vec::with_capacity(styles.len()); + for (keyword, start, remove_start, end, remove_end) in &styles { + if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) { + let language = if keyword == "```language" { + chars[start+3..remove_start-1] + .into_iter() + .collect::() + } else { + "".to_string() + }; + all_indexes.push(vec![*start, *remove_start, *end, *remove_end]); + let last_index = all_indexes.len() - 1; + message_entities.push((true, last_index, TELEGRAM_STYLES.iter().find(|&&(k, _)| k == keyword).unwrap().1.to_string(), *start, language)); + message_entities.push((false, last_index, "".to_string(), *end, "".to_string())); + } else if keyword == "```>" { + all_indexes.push(vec![0, 0, *start, 1]); + message_entities.push((false, all_indexes.len() - 1, "".to_string(), *start, "".to_string())); + } + } + message_entities.sort_by(sort_message_entities); + + let formatted_text = chars.into_iter().collect::(); + let utf16_lengths: Vec = utf8_to_utf16_length(&formatted_text); + + let mut offset = 0; + for (is_start, index, _, _, _) in &message_entities { + let indexes = &mut all_indexes[*index]; + if *is_start { + indexes[0] -= offset; + offset += indexes[1]; + } else { + indexes[2] -= offset; + offset += indexes[3]; + } + } + Ok(( + formatted_text, + message_entities.into_iter() + .filter(|(is_start, _, _, _, _)| { *is_start } ) + .map(|(_, index, format, _, language)| { (format, utf16_lengths[all_indexes[index][0]], utf16_lengths[all_indexes[index][2]], language) }) + .collect() + )) +} + +fn sort_message_entities(first: &(bool, usize, String, usize, String), second: &(bool, usize, String, usize, String)) -> Ordering { + return first.3.cmp(&second.3); +} + +fn utf8_to_utf16_length(utf8_str: &str) -> Vec { + let mut utf16_lengths = Vec::with_capacity(utf8_str.len()); + + let mut length = 0; + for byte in utf8_str.as_bytes() { + if (byte & 0xc0) != 0x80 { + length += if *byte >= 0xf0 { 2 } else { 1 }; + } + utf16_lengths.push(length); + } + utf16_lengths +} + fn remove_non_escaped_backslashes(text: String) -> String { let tmp_string = text.replace("\\\\", PLACEHOLDER); let tmp_string = tmp_string.replace("\\", ""); @@ -172,7 +265,7 @@ fn parse_quotes_in_code_block(chars: &Vec, start: usize, end: usize, depth let c = chars[index]; if QUOTE_KEYWORDS.contains(&c) { if is_nested_quote(chars, index, depth) { - quotes.push((">>".to_string(), index, index + 1, index + 1, index + 1)); + quotes.push(("```>".to_string(), index, index + 1, index + 1, index + 1)); } index += 1; continue; @@ -331,5 +424,6 @@ fn preceeded_by_backslash(chars: &Vec, index: usize, start: usize) -> bool #[pymodule] fn slidge_style_parser(_py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(format_body, m)?)?; + m.add_function(wrap_pyfunction!(parse_for_telegram, m)?)?; Ok(()) } diff --git a/tests/test_style_parser.py b/tests/test_style_parser.py deleted file mode 100644 index 8f39580..0000000 --- a/tests/test_style_parser.py +++ /dev/null @@ -1,276 +0,0 @@ -from slidge_style_parser import format_body - -MATRIX_FORMATS = { - "_": ("", ""), - "*": ("", ""), - "~": ("", ""), - "`": ("", ""), - "```": ("
", "
"), - "```language": ("
", "
"), - ">": ("
", "
"), - "||": ("", ""), - "\n": ("
", "") -} - -def test_basic(): - test = "_underline_" - formatted_body = "underline" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "*bold*" - formatted_body = "bold" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "~strikethrough~" - formatted_body = "strikethrough" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "`code span`" - formatted_body = "code span" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = """ - ```python - def test_basic(): - test = "_underline_" - formatted_body = "underline" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - ``` - """ - formatted_body = test = """
def test_basic():
test = "_underline_"
formatted_body = "underline"
assert(format_body(test, MATRIX_FORMATS) == (test, formatted_body))

""" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "```\ncode block\n```" - formatted_body = "
code block
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "||this message contains a spoiler||" - formatted_body = "this message contains a spoiler" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - -def test_quotes(): - test = ">single" - formatted_body = "
single
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">single arrow ->" - formatted_body = "
single arrow ->
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">single\n>grouped" - formatted_body = "
single
grouped
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">>double" - formatted_body = "
double
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">>double\n>>double" - formatted_body = "
double
double
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">>double\n&>not quote" - formatted_body = "
double

&>not quote" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">>double\n>grouped single" - formatted_body = "
double

grouped single
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">>>tripple\n>single\n>>double" - formatted_body = "
tripple

single
double
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - -def test_code_blocks(): - test = "```\nhacker\ncode\n```" - formatted_body = "
hacker
code
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "```python\nhacker code\n```" - formatted_body = "
hacker code
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "```python\nhacker code\n```\nnormal text" - formatted_body = "
hacker code

normal text" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">```java\n>why are you quoting a code block\n>```" - formatted_body = "
why are you quoting a code block
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">>```\n>>double quote code block\n>single quote not in code block\nnormal text" - formatted_body = "
double quote code block

single quote not in code block

normal text" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">```\n>please stop trying to break my parser ;-;" - formatted_body = "
please stop trying to break my parser ;-;
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">>```\n>>>>double quote code block\n>single quote not in code block\nnormal text" - formatted_body = "
>>double quote code block

single quote not in code block

normal text" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "_```_ignored\ninvalid code block\n```" - formatted_body = "```ignored
invalid code block
```" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - -def test_escaped(): - test = "\\_no underline_" - formatted_body = "_no underline_" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "\\\\_no underline_" - formatted_body = "\\_no underline_" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">>>tripple\n\\>none\n>>double" - formatted_body = "
tripple

>none
double
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - -def test_nested(): - test = "`*~_code span_~*`" - formatted_body = "*~_code span_~*" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "*_~`code span`~_*" - formatted_body = "code span" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">*_~`code span`~_*" - formatted_body = "
code span
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "*bold star >*< star bold*" - formatted_body = "bold star >*< star bold" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "*_bold*_" - formatted_body = "_bold_" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "__underlined__" - formatted_body = "underlined" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - -def test_no_changes(): - test = "" - formatted_body = "" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "~~ empty `````` styles **" - formatted_body = "~~ empty `````` styles **" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "this is not an empty string" - formatted_body = "this is not an empty string" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "arrow ->" - formatted_body = "arrow ->" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = " > no quote" - formatted_body = " > no quote" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "_not underlined" - formatted_body = "_not underlined" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "|not a spoiler|" - formatted_body = "|not a spoiler|" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "||\nalso\nnot\na\nspoiler||" - formatted_body = "||
also
not
a
spoiler||" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "`no code\nblock here`" - formatted_body = "`no code
block here`" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "invalid ```\ncode block\n```" - formatted_body = "invalid ```
code block
```" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "```\ncode block\ninvalid```" - formatted_body = "```
code block
invalid```" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "```\ncode block\n```invalid" - formatted_body = "```
code block
```invalid" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - -def test_assorted(): - test = "\n" - formatted_body = "
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "at the ||end||" - formatted_body = "at the end" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "in the ~middle~ here" - formatted_body = "in the middle here" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "_underline_ *bold* ~strikethrough~ >not quote ||spoiler||\n>quote\nnothing\nnothing\n>>>>another quote with ||~_*```four```*_~||" - formatted_body = "underline bold strikethrough >not quote spoiler
quote

nothing
nothing
another quote with ```four```
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">```\n>do be do be dooo ba do be do be do ba\n>>>" - formatted_body = "
do be do be dooo ba do be do be do ba
>>
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "\n\n>```\n>do be do be dooo ba do be do be do ba\na\n\n\naoeu\n" - formatted_body = "

do be do be dooo ba do be do be do ba

a


aoeu
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">```\n>do be do be dooo ba do be do be do ba\n>\n>\n>aoeu" - formatted_body = "
do be do be dooo ba do be do be do ba


aoeu
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">```\n>code block\n>```invalid end\n" - formatted_body = "
code block
```invalid end

" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "invalid ```\ncode block\n*bold*\n```" - formatted_body = "invalid ```
code block
bold
```" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - -def test_weird_utf8(): - test = "โค๏ธ๐Ÿ’“๐Ÿ’•๐Ÿ’–๐Ÿ’— ||๐Ÿ’™๐Ÿ’š๐Ÿ’›๐Ÿ’œ๐Ÿ–ค|| ๐Ÿ’๐Ÿ’ž๐Ÿ’Ÿโฃ๏ธ" - formatted_body = "โค๏ธ๐Ÿ’“๐Ÿ’•๐Ÿ’–๐Ÿ’— ๐Ÿ’™๐Ÿ’š๐Ÿ’›๐Ÿ’œ๐Ÿ–ค ๐Ÿ’๐Ÿ’ž๐Ÿ’Ÿโฃ๏ธ" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ง _underline_๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆโ€๐Ÿ‘ง" - formatted_body = "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ง underline๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆโ€๐Ÿ‘ง" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "\u202eRight to left" - formatted_body = "\u202eRight to left" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = ">\u202eRight to left quote?" - formatted_body = "
\u202eRight to left quote?
" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "_Invisible\u200bseparator_" - formatted_body = "Invisible\u200bseparator" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - - test = "~\u200b~" - formatted_body = "\u200b" - assert(format_body(test, MATRIX_FORMATS) == formatted_body) - -LIMITED_FORMATS = { - "_": ("", ""), - "~": ("", ""), - "`": ("", "") -} - -def test_limited(): - test = "_underline_ *bold* ~strikethrough~ >not quote ||spoiler||\n>quote\nnothing\nnothing\n>>>>another quote with ||~_*```four```*_~||" - formatted_body = "underline *bold* strikethrough >not quote ||spoiler||\n>quote\nnothing\nnothing\n>>>>another quote with ||*```four```*||" - assert(format_body(test, LIMITED_FORMATS) == formatted_body) -- cgit v1.2.3