Implement HTML-code unescaping when parsing URLs

main
Adrian 11 months ago
parent fb99ca730f
commit 3dac4e4c68
  1. 3
      Cargo.toml
  2. 4
      src/converter/match_page.rs
  3. 15
      src/converter/results.rs
  4. 2
      src/converter/team_page.rs
  5. 4
      src/converter/upcoming.rs
  6. 7
      src/tl_extensions.rs
  7. 4
      tests/match_page.rs

@ -1,6 +1,6 @@
[package]
name = "hltv"
version = "0.3.1"
version = "0.3.2"
edition = "2021"
license = "MIT OR Apache-2.0"
description = "A client to fetch and parse data from HLTV.org"
@ -18,6 +18,7 @@ doctest = false
reqwest = "0.11.9"
chrono = "0.4.19"
tl = "0.6.2"
html-escape = "0.2.11"
[dev-dependencies]
tokio = { version = "1.16.1", features = ["macros", "time"] }

@ -67,8 +67,8 @@ fn get_team(root: RichNode, class: &str) -> Option<Team> {
.parse()
.ok()?,
name: t.find("teamName").inner_text()?,
logo: t.find("logo").get_attr_str("src")?,
alt_logo: t.find("night-only").get_attr_str("src"),
logo: t.find("logo").get_attr_str_esc("src")?,
alt_logo: t.find("night-only").get_attr_str_esc("src"),
})
}

@ -1,5 +1,3 @@
use tl::queryselector::QuerySelectorIterator;
use crate::data::*;
use crate::tl_extensions::*;
use crate::ConvertInstance;
@ -8,7 +6,11 @@ use crate::{Error, Error::ConversionError};
impl ConvertInstance for Vec<MatchResult> {
fn convert<'a>(d: &'a tl::VDom<'a>) -> Result<Vec<MatchResult>, Error> {
let mut result = Vec::<MatchResult>::new();
let match_containers = d.query_selector("div.results-all").unwrap().next().unwrap();
let match_containers = d
.query_selector("div.results-all")
.unwrap()
.next()
.ok_or(ConversionError("no div.results-all container found"))?;
for h in match_containers.to_rich(d).find_all("result-con") {
result.push(MatchResult {
id: parse_id(h)?,
@ -24,12 +26,6 @@ impl ConvertInstance for Vec<MatchResult> {
}
}
/// Returns an iterator over the roots of interest (i.e. the containers of
/// results).
///
/// The roots are selected from `d` with the CSS selector `div.result-con`.
///
/// # Panics
///
/// Panics if `query_selector` yields no value for that selector, because the
/// result is unwrapped.
fn get_roots<'a>(d: &'a tl::VDom<'a>) -> QuerySelectorIterator<tl::VDom> {
d.query_selector("div.result-con").unwrap()
}
fn parse_format(h: RichNode) -> Result<MatchFormat, Error> {
match h
.find("map-text")
@ -137,5 +133,4 @@ mod tests {
}
);
}
}

@ -33,7 +33,7 @@ fn get_root(d: &tl::VDom) -> Result<NodeHandle, Error> {
fn get_logo(h: RichNode) -> Result<String, Error> {
h.find("profile-team-logo-container")
.find("teamlogo")
.get_attr_str("src")
.get_attr_str_esc("src")
.ok_or(ConversionError("couldn't find logo container or logo"))
}

@ -34,8 +34,8 @@ fn parse_team(h: RichNode, team_id: &str) -> Option<Team> {
Some(Team {
id: h.get_attr(team_id).unwrap_or(None)?,
name: t.find("matchTeamName").inner_text()?,
logo: t.find("matchTeamLogo").get_attr_str("src")?,
alt_logo: t.find("night-only").get_attr_str("src"),
logo: t.find("matchTeamLogo").get_attr_str_esc("src")?,
alt_logo: t.find("night-only").get_attr_str_esc("src"),
})
}

@ -145,6 +145,13 @@ impl<'a> RichNode<'a> {
Some(result.as_utf8_str().to_string())
}
/// Returns the value of attribute `attr` with HTML entities decoded
/// (for example `&amp;` becomes `&`).
///
/// The raw value is obtained from `get_attr_str`; when that returns `None`,
/// this method returns `None` as well.
pub fn get_attr_str_esc(&self, attr: &str) -> Option<String> {
    self.get_attr_str(attr)
        .map(|raw| html_escape::decode_html_entities(&raw).into_owned())
}
pub fn get_attr<T>(&self, attr: &str) -> Result<Option<T>, Error>
where
T: FromStr,

@ -56,8 +56,8 @@ async fn concluded_bo3() -> Result<(), Box<dyn Error>> {
MatchPage {
id: 2346065,
status: MatchStatus::Finished,
team1: Some(Team::new(6665, "Astralis", "https://img-cdn.hltv.org/teamlogo/9bgXHp-oh1oaXr7F0mTGmd.svg?ixlib=java-2.1.0&amp;s=f567161ab183001be33948b98c4b2067", None)),
team2: Some(Team::new(9565, "Vitality", "https://img-cdn.hltv.org/teamlogo/GAlByJtDTnkgbb9p_71SUL.png?ixlib=java-2.1.0&amp;w=100&amp;s=ddc5952ae492cbefb10fbe64471486b5", None)),
team1: Some(Team::new(6665, "Astralis", "https://img-cdn.hltv.org/teamlogo/9bgXHp-oh1oaXr7F0mTGmd.svg?ixlib=java-2.1.0&s=f567161ab183001be33948b98c4b2067", None)),
team2: Some(Team::new(9565, "Vitality", "https://img-cdn.hltv.org/teamlogo/GAlByJtDTnkgbb9p_71SUL.png?ixlib=java-2.1.0&w=100&s=ddc5952ae492cbefb10fbe64471486b5", None)),
event: Event {
id: 5206,
name: "BLAST Premier Global Final 2020".to_string()

Loading…
Cancel
Save