1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
use std::cmp::Ordering;
use pyo3::prelude::*;
use crate::parser::parse_with_limits;
const TELEGRAM_STYLES: &[(&'static str, &'static str)] = &[
("_", "italics"),
("*", "bold"),
("~", "strikethrough"),
("||", "spoiler"),
("`", "code"),
("```language", "pre"),
("```", "pre")
];
#[pyfunction]
pub fn parse_for_telegram(body: String) -> PyResult<(String, Vec<(String, usize, usize, String)>)> {
let mut chars: Vec<char> = body.chars().collect();
if chars.len() < 1 {
return Ok((body, Vec::with_capacity(0)));
}
let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0);
let mut remove_tags: Vec<(usize, usize)> = Vec::with_capacity(styles.len() * 2);
for (keyword, start, remove_start, end, remove_end) in &styles {
if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) {
remove_tags.push((*start, *remove_start));
remove_tags.push((*end, *remove_end));
} else if keyword == "```>" || keyword == "\\" {
remove_tags.push((*start, *remove_start));
}
}
// is_start (*<--- start, end -->*), index of all_indexes, format, index of tag, language of codeblock
let mut message_entities: Vec<(bool, usize, String, usize, String)> = Vec::with_capacity(styles.len() * 2);
let mut all_indexes: Vec<Vec<usize>> = Vec::with_capacity(styles.len());
for (keyword, start, remove_start, end, remove_end) in &styles {
if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) {
let language = if keyword == "```language" {
chars[start+3..remove_start-1]
.into_iter()
.collect::<String>()
} else {
"".to_string()
};
all_indexes.push(vec![*start, *remove_start - *start, *end, *remove_end - *end]);
let last_index = all_indexes.len() - 1;
message_entities.push((true, last_index, TELEGRAM_STYLES.iter().find(|&&(k, _)| k == keyword).unwrap().1.to_string(), *start, language));
message_entities.push((false, last_index, "".to_string(), *end, "".to_string()));
} else if keyword == "```>" || keyword == "\\" {
all_indexes.push(vec![0, 0, *start, 1]);
message_entities.push((false, all_indexes.len() - 1, "".to_string(), *start, "".to_string()));
}
}
message_entities.sort_by(sort_message_entities);
remove_tags.sort_by(|a, b| b.0.cmp(&a.0));
for (index, end) in remove_tags {
chars = [chars[..index].to_vec(), chars[end..].to_vec()].concat();
}
let formatted_text = chars.into_iter().collect::<String>();
let utf16_lengths: Vec<usize> = utf8_to_utf16_length(&formatted_text);
let mut offset = 0;
for (is_start, index, _, _, _) in &message_entities {
let indexes = &mut all_indexes[*index];
if *is_start {
indexes[0] -= offset;
offset += indexes[1];
} else {
indexes[2] -= offset;
offset += indexes[3];
}
}
Ok((
formatted_text,
message_entities.into_iter()
.filter(|(is_start, _, _, _, _)| { *is_start } )
.map(|(_, index, format, _, language)| { (format, utf16_lengths[all_indexes[index][0]], utf16_lengths[all_indexes[index][2] - 1] - utf16_lengths[all_indexes[index][0]], language) })
.collect()
))
}
fn sort_message_entities(first: &(bool, usize, String, usize, String), second: &(bool, usize, String, usize, String)) -> Ordering {
return first.3.cmp(&second.3);
}
fn utf8_to_utf16_length(utf8_str: &str) -> Vec<usize> {
let mut utf16_lengths = Vec::with_capacity(utf8_str.len());
let mut length = 0;
for byte in utf8_str.as_bytes() {
if (byte & 0xc0) != 0x80 {
length += if *byte >= 0xf0 { 2 } else { 1 };
}
utf16_lengths.push(length);
}
utf16_lengths
}
|