Browse Source

Добавлены заголовки ТАСС, исправлена очистка HTML, остаются теги таблиц и списков и много чего ещё. #5 #1

master
parent
commit
03ca3aadc9
  1. 82
      news_parser.js

82
news_parser.js

@ -113,26 +113,7 @@
for (var i in a_Elements) {
let e = a_Elements[i];
let content = '';
if (e.querySelectorAll) {
var children = e.querySelectorAll("*");
if (children.length == 0 || e.innerText) {
content += a_GrubTextFunc(e);
}
else {
for (let i = 0; i < children.length; i++) {
let c = children[i];
content += a_GrubTextFunc(c);
}
}
}
if (a_FinishWorkFunc) {
result += a_FinishWorkFunc(content, e);
}
else {
result += content;
}
result += a_FinishWorkFunc(e);
}
return result;
}
@ -183,8 +164,57 @@
return GrubTextFunc
}
/**
 * Strips every attribute from the given element, in place.
 *
 * The element itself is kept (not replaced by a fresh copy), so references
 * to it and to its descendants held by the caller stay valid.
 *
 * @param {Element} a_Element - element whose attributes are removed.
 */
function RemoveAllAttributes(a_Element) {
    // Snapshot the names first: `attributes` is a live collection that
    // shrinks while attributes are being removed.
    const names = Array.from(a_Element.attributes, function (attr) {
        return attr.name;
    });
    for (const name of names) {
        a_Element.removeAttribute(name);
    }
}
/**
 * Unwraps an element: moves all of its child nodes in front of it inside
 * the containing node, then removes the now-empty element.
 *
 * @param {Node} a_Element - node to unwrap.
 * @param {Node} a_Parent - container used when a_Element has no parentNode.
 */
function RemoveCurrentElementSaveChild(a_Element, a_Parent) {
    const container = a_Element.parentNode || a_Parent;
    // Each insertBefore detaches the child from a_Element, so firstChild
    // advances until the element is empty.
    for (let child = a_Element.firstChild; child; child = a_Element.firstChild) {
        container.insertBefore(child, a_Element);
    }
    container.removeChild(a_Element);
}
/**
 * Produces a cleaned-up HTML string from a deep copy of a_Element; the
 * original element is not modified.
 *
 * Cleaning steps, in order:
 *  1. div/span/em/svg/path elements are unwrapped (children are kept).
 *  2. Attributes are stripped from every remaining element via
 *     RemoveAllAttributes.
 *  3. Links are kept only when their href attribute is an absolute
 *     http(s) URL; for those the href is preserved. Any other <a> is
 *     unwrapped, leaving just its content.
 *
 * @param {Element} a_Element - source element to clean.
 * @param {string} [a_OutTag] - tag name to wrap the result in.
 * @param {string} [a_TextAlign] - text-align value for the wrapper; the
 *     wrapper is added only when both a_OutTag and a_TextAlign are truthy.
 * @returns {string} cleaned HTML markup.
 */
function GetClearHtml(a_Element, a_OutTag, a_TextAlign) {
    const clear_element = a_Element.cloneNode(true);

    const tags_to_delete = ['div', 'span', 'em', 'svg', 'path'];
    tags_to_delete.forEach(function (del_tag_name) {
        clear_element.querySelectorAll(del_tag_name).forEach(function (element) {
            RemoveCurrentElementSaveChild(element, clear_element);
        });
    });

    // The check is made against the raw href attribute: `pathname` never
    // contains the scheme, so testing it would reject every link.
    // Anchored so that only URLs that actually start with http(s):// pass.
    const absolute_url_re = /^https?:\/\//i;
    clear_element.querySelectorAll('*').forEach(function (element) {
        const is_link = element.matches('a');
        // Read href before the attributes are stripped so it can be
        // restored on links that are kept.
        const href = is_link ? element.getAttribute('href') : null;
        RemoveAllAttributes(element);
        if (!is_link) {
            return;
        }
        if (href !== null && absolute_url_re.test(href)) {
            element.setAttribute('href', href);
        }
        else {
            RemoveCurrentElementSaveChild(element, clear_element);
        }
    });

    let result = clear_element.innerHTML;
    if (a_OutTag && a_TextAlign) {
        result = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + result + '</' + a_OutTag + '>';
    }
    return result;
}
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
function FinishWorkFunc(a_Content, a_Element) {
function FinishWorkFunc(a_Element) {
let out_tag = a_OutTag;
if (a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element)) {
out_tag = 'h2';
}
let a_Content = GetClearHtml(a_Element, out_tag, a_TextAlign)
if (a_ClearTextFunc) {
a_Content = a_ClearTextFunc(a_Content);
}
@ -204,12 +234,9 @@
let c = childrens[i];
content += FinishWorkFunc(GrubTextFuncTemplate()(c), c);
}
return content
return content;
}
if (a_OutTag && a_TextAlign) {
a_Content = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + a_Content + '</' + a_OutTag + '>';
}
if (a_Element && a_Element.dataset && a_Element.dataset.type == 'quote') {
a_Content = '<blockquote>' + a_Content + '</blockquote>';
}
@ -244,14 +271,15 @@
const grub_text_func = GrubTextFuncTemplate()
if (location.hostname == 'tass.ru') {
// test: https://tass.ru/proisshestviya/19117971
const base_element = document.getElementById('content_box');
content = MakeContentByNews(
base_element,
base_element,
base_element,
'tass_pkg_title--variant_h1_default.*',
'(ArticleHeader_titles|tass_pkg_title--variant_h1_default).*',
'Image_wrapper_.*',
'Paragraph_paragraph.*',
'(Paragraph_paragraph|Title_title).*',
ElementCheckerTrue,
['/ТАСС/. ']
);

Loading…
Cancel
Save