// ==UserScript==
// @name News parser
// @namespace http://zakonvremeni.ru
// @version 0.2
// @description Parse news
// @author AlexeiBv+mirocod@narod.ru
// @match https://tass.ru/*
// @match https://ria.ru/*
// @match https://rg.ru/*
// @match https://russian.rt.com/*
// @match https://zakonvremeni.ru/*
// @icon https://icons.duckduckgo.com/ip2/zakonvremeni.ru.ico
// @grant none
// ==/UserScript==
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_news_parser@narod.ru>
( function ( ) {
'use strict' ;
// Поиск элементов по регулярному выражению
/**
 * Returns the class list of an element as a plain string.
 *
 * HTML elements expose `className` as a string, but SVG elements expose an
 * SVGAnimatedString object whose `baseVal` holds the class list. Without this
 * normalisation a regular expression would be tested against the text
 * "[object SVGAnimatedString]" instead of the real class names.
 *
 * @param {Element} a_Element - element to inspect
 * @returns {string} space-separated class names, or '' when there are none
 */
function GetElementClassName(a_Element) {
    const class_name = a_Element.className;
    if (typeof class_name === 'string') {
        return class_name;
    }
    if (class_name && typeof class_name.baseVal === 'string') {
        return class_name.baseVal;
    }
    return '';
}
/**
 * Name accessor usable as the "get element name" callback of the
 * regular-expression helpers: yields the node name of the element.
 *
 * @param {Node} a_Element - node to inspect
 * @returns {string} value of the node's `nodeName` property
 */
function GetNodeName(a_Element) {
    const { nodeName } = a_Element;
    return nodeName;
}
/**
 * Tests whether the name of an element contains a whitespace-delimited token
 * that matches the given pattern.
 *
 * The pattern is wrapped so that a match must start at the beginning of the
 * name or after whitespace, and must not be followed by a non-space character.
 *
 * @param {function(Element): string} a_GetElementNameFunc - extracts the text to test
 * @param {string} a_RegExpPattern - regular-expression source for one token
 * @param {Element} a_Element - element handed to the name extractor
 * @returns {boolean} true when the extracted name matches
 */
function CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, a_Element) {
    const matcher = new RegExp(`(?:^|\\s)${a_RegExpPattern}(?!\\S)`);
    const name = a_GetElementNameFunc(a_Element);
    return matcher.test(name);
}
/**
 * Collects, in document order, every descendant of a parent node whose name
 * (as produced by the extractor callback) matches the pattern.
 *
 * @param {function(Element): string} a_GetElementNameFunc - name extractor passed on to CheckRegExp
 * @param {string} a_RegExpPattern - pattern passed on to CheckRegExp
 * @param {Element|Document} [a_ElementParent] - search root; `document` is used when it is falsy
 * @returns {Element[]} matching descendants
 */
function FindElementsByRegExp(a_GetElementNameFunc, a_RegExpPattern, a_ElementParent) {
    const root = a_ElementParent || document;
    const matches = [];
    for (const candidate of root.getElementsByTagName('*')) {
        if (CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, candidate)) {
            matches.push(candidate);
        }
    }
    return matches;
}
// Работа со строками
/**
 * Strips leading and trailing whitespace from a string.
 *
 * @param {?string} s - text to trim; any falsy value is treated as ''
 * @returns {string} trimmed text
 */
function TrimString(s) {
    const text = s || '';
    return text.trim();
}
/**
 * Drops everything up to and including the first occurrence of a splitter.
 *
 * @param {string} a_String - text to cut
 * @param {string} a_Splitter - marker to search for
 * @returns {string} text after the first splitter, or the input unchanged
 *     when the splitter does not occur
 */
function RemoveBeforeSplitter(a_String, a_Splitter) {
    const position = a_String.indexOf(a_Splitter);
    return position === -1
        ? a_String
        : a_String.substring(position + a_Splitter.length);
}
/**
 * Cuts a string at the first occurrence of a splitter.
 *
 * @param {string} a_String - text to cut
 * @param {string} a_Splitter - marker to search for
 * @param {boolean} [a_SaveSplitter] - when truthy the splitter itself is kept
 *     at the end of the result
 * @returns {string} text before the first splitter (optionally including it),
 *     or the input unchanged when the splitter does not occur
 */
function RemoveAfterSplitter(a_String, a_Splitter, a_SaveSplitter) {
    const position = a_String.indexOf(a_Splitter);
    if (position === -1) {
        return a_String;
    }
    const end = a_SaveSplitter ? position + a_Splitter.length : position;
    return a_String.substring(0, end);
}
/**
 * Reduces a URL to the part that identifies the page: everything from the
 * first `?` (query string) or `#` (fragment) onwards is removed.
 *
 * Only the query string used to be removed, so a link such as
 * `https://site/page.html#comments` kept its fragment in the source
 * attribution.
 *
 * @param {string} a_Url - absolute or relative URL
 * @returns {string} URL without query string and fragment
 */
function ClearUrl(a_Url) {
    const cut = a_Url.search(/[?#]/);
    return cut === -1 ? a_Url : a_Url.substring(0, cut);
}
/**
 * Builds a content cleaner that strips leading fragments from a text.
 *
 * The returned function applies RemoveBeforeSplitter once per entry of
 * `a_RemoveBeforeList`, in list order, each time on the result of the
 * previous step.
 *
 * @param {string[]} a_RemoveBeforeList - splitters; the text before (and
 *     including) each one is removed
 * @param {boolean} a_OnlyFirstIndex - when truthy, only content whose element
 *     index is 1 (or falsy/absent) is cleaned; other content is returned as is
 * @returns {function(string, number=): string} cleaner taking the content and
 *     the 1-based index of the element it came from
 */
function ClearTextFuncTemplate(a_RemoveBeforeList, a_OnlyFirstIndex) {
    return function ClearTextFunc(a_Content, a_ElementIndex) {
        const isLaterElement = a_ElementIndex && a_ElementIndex != 1;
        if (a_OnlyFirstIndex && isLaterElement) {
            return a_Content;
        }
        return a_RemoveBeforeList.reduce(
            (text, splitter) => RemoveBeforeSplitter(text, splitter),
            a_Content
        );
    };
}
// Работа с контейнерами
/**
 * Looks for a PNG/JPG picture inside the given containers and renders it as a
 * paragraph with a fixed-width image.
 *
 * When several images qualify, the last one found (containers in order,
 * images in document order) is used.
 *
 * The URL pattern is a regex literal: it used to be built from a string
 * literal, where `\.` collapses to `.`, so the "dot before the extension"
 * matched any character (e.g. `https://host/xpng`).
 *
 * @param {Iterable<Element>} a_Elements - containers to search
 * @param {string} a_TextAlign - CSS text-align value for the wrapping paragraph
 * @returns {string} HTML of the image paragraph, or '' when no image matches
 */
function GetImageInContainers(a_Elements, a_TextAlign) {
    const image_url_re = /https?:\/\/.*\.(?:png|jpg)/;
    let img_src = '';
    for (const container of a_Elements) {
        for (const image of container.querySelectorAll('img')) {
            if (image_url_re.test(image.src)) {
                img_src = image.src;
            }
        }
    }
    if (img_src.length > 0) {
        return '<p style = "text-align:' + a_TextAlign + ';"><img src = "' + img_src + '" width = "600px"/></p>';
    }
    return '';
}
/**
 * Renders every element of a list and concatenates the results.
 *
 * Each element is passed, together with its 1-based position, to
 * `a_FinishWorkFunc`. When no finishing function is supplied, the
 * text-extraction callback `a_GrubTextFunc` is used instead. The second
 * parameter used to be ignored, so two-argument calls failed with
 * "a_FinishWorkFunc is not a function" as soon as an element was found.
 *
 * @param {Iterable<Element>} a_Elements - elements to render, in output order
 * @param {function(Element): string} a_GrubTextFunc - plain-text extractor,
 *     used only as the fallback renderer
 * @param {function(Element, number): string} [a_FinishWorkFunc] - renderer
 *     receiving the element and its 1-based index
 * @returns {string} concatenated output of the renderer ('' for an empty list)
 */
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
    const render = typeof a_FinishWorkFunc === 'function' ? a_FinishWorkFunc : a_GrubTextFunc;
    let result = '';
    let element_index = 1;
    for (const element of a_Elements) {
        result += render(element, element_index);
        element_index += 1;
    }
    return result;
}
// Фильтрация элементов
/**
 * Keeps only the elements accepted by a checker callback.
 *
 * @param {ArrayLike<Element>} a_Elements - candidates, read by index up to `length`
 * @param {function(Element): *} a_ElementChecker - called with one element;
 *     a truthy return value keeps the element
 * @returns {Element[]} new array with the accepted elements in original order
 */
function FIlterElements(a_Elements, a_ElementChecker) {
    return Array.from(a_Elements).filter((candidate) => a_ElementChecker(candidate));
}
/**
 * Element checker that accepts everything; used where no filtering is wanted.
 *
 * @param {Element} a_Element - ignored
 * @returns {boolean} always true
 */
function ElementCheckerTrue(a_Element) {
    return true;
}
/**
 * Element checker that rejects blocks whose `data-type` attribute marks them
 * as 'article', 'banner' or 'media'; every other element is accepted.
 *
 * @param {HTMLElement} a_Element - element whose `dataset.type` is inspected
 * @returns {boolean} false for the three rejected types, true otherwise
 */
function ElementCheckerRia(a_Element) {
    const rejectedTypes = ['article', 'banner', 'media'];
    return !rejectedTypes.includes(a_Element.dataset.type);
}
/**
 * Element checker that accepts only the element carrying the microdata
 * attribute `itemprop="articleBody"`.
 *
 * The value is read with getAttribute: DOM elements have no `itemprop`
 * property, so the former `a_Element.itemprop` lookup was always undefined and
 * the checker rejected every element.
 *
 * @param {Element} a_Element - element to test
 * @returns {boolean} true when the itemprop attribute equals 'articleBody'
 */
function ElementCheckerZV(a_Element) {
    return a_Element.getAttribute('itemprop') === 'articleBody';
}
// Обработка элементов
/**
 * Builds a plain-text extractor for elements.
 *
 * The returned function yields the trimmed `textContent` of an element, but
 * only when the element's `innerText` is non-empty; otherwise it yields ''.
 *
 * @returns {function(Element): string} text extractor
 */
function GrubTextFuncTemplate() {
    return function GrubTextFunc(a_Element) {
        return a_Element.innerText ? TrimString(a_Element.textContent) : '';
    };
}
/**
 * Strips the attributes of an element in place.
 *
 * The previous implementation assigned to a misspelled `outherHTML` property
 * and therefore did nothing. Assigning the real `outerHTML` is not a usable
 * fix either: it replaces the element with freshly parsed nodes, detaching the
 * element (and its descendants) that callers are still iterating over. The
 * attributes are removed one by one instead.
 *
 * Attributes that carry content rather than presentation are preserved by
 * default, so that links and images keep working after the clean-up.
 *
 * @param {Element} a_Element - element to clean; modified in place
 * @param {string[]} [a_KeepAttributes] - attribute names to leave untouched;
 *     pass [] to remove every attribute
 */
function RemoveAllAttributes(a_Element, a_KeepAttributes = ['href', 'src', 'alt']) {
    // `attributes` is a live collection, so take a snapshot of the names first.
    const attribute_names = Array.from(a_Element.attributes, (attribute) => attribute.name);
    for (const name of attribute_names) {
        if (!a_KeepAttributes.includes(name)) {
            a_Element.removeAttribute(name);
        }
    }
}
/**
 * Unwraps an element: moves all of its child nodes in front of it, keeping
 * their order, and then removes the now-empty element.
 *
 * @param {Node} a_Element - element to unwrap
 * @param {Node} a_Parent - node used as the parent when the element has no
 *     `parentNode`
 */
function RemoveCurrentElementSaveChild(a_Element, a_Parent) {
    const container = a_Element.parentNode || a_Parent;
    for (let child = a_Element.firstChild; child; child = a_Element.firstChild) {
        // insertBefore moves the child, so firstChild advances on its own.
        container.insertBefore(child, a_Element);
    }
    container.removeChild(a_Element);
}
/**
 * Produces simplified HTML for an article fragment.
 *
 * Works on a deep clone, so the page itself is not modified. Steps, in order:
 *   1. remove descendants that are site-specific inserts (matched by class
 *      name, or RG-VIDEO / RG-INCUT elements); for every other descendant call
 *      RemoveAllAttributes and drop its `class` and `id`;
 *   2. unwrap div, span, em, svg and path elements (children are kept);
 *   3. unwrap links whose host equals the current page's hostname;
 *   4. set the text alignment on p and h2 elements and drop the ones whose
 *      content is empty or a single space;
 *   5. optionally wrap the result in an aligned outer tag.
 *
 * @param {Element} a_Element - source element
 * @param {string} [a_OutTag] - tag name for the outer wrapper
 * @param {string} [a_TextAlign] - CSS text-align value; the wrapper is added
 *     only when both a_OutTag and a_TextAlign are truthy
 * @returns {string} cleaned HTML
 */
function GetClearHtml(a_Element, a_OutTag, a_TextAlign) {
    const rtClasses = 'read-more|article__cover';
    const rgClasses = 'portal|rg-incut|article-img|Section';
    const unwantedClassPattern = '.*(' + rtClasses + '|' + rgClasses + ').*';
    const copy = a_Element.cloneNode(true);

    // Step 1: drop inserts, simplify everything else.
    for (const node of copy.querySelectorAll('*')) {
        const isRgWidget = node.nodeName === 'RG-VIDEO' || node.nodeName === 'RG-INCUT';
        const hasUnwantedClass = node.parentNode
            && CheckRegExp(GetElementClassName, unwantedClassPattern, node);
        if (hasUnwantedClass || isRgWidget) {
            node.parentNode.removeChild(node);
            continue;
        }
        RemoveAllAttributes(node);
        node.removeAttribute('class');
        node.removeAttribute('id');
    }

    // Step 2: unwrap purely structural tags, one tag name at a time.
    for (const tagName of ['div', 'span', 'em', 'svg', 'path']) {
        for (const node of copy.querySelectorAll(tagName)) {
            RemoveCurrentElementSaveChild(node, copy);
        }
    }

    // Step 3: unwrap links that point back to the current site.
    for (const link of copy.querySelectorAll('a')) {
        if (link.host === location.hostname) {
            RemoveCurrentElementSaveChild(link, copy);
        }
    }

    // Step 4: align text blocks and discard empty ones.
    for (const tagName of ['p', 'h2']) {
        for (const block of copy.querySelectorAll(tagName)) {
            block.style.textAlign = a_TextAlign;
            const html = block.innerHTML;
            if (html === ' ' || html === '') {
                block.parentNode.removeChild(block);
            }
        }
    }

    // Step 5: optional outer wrapper.
    const inner = copy.innerHTML;
    if (!(a_OutTag && a_TextAlign)) {
        return inner;
    }
    return '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + inner + '</' + a_OutTag + '>';
}
/**
 * Builds the function that turns one page element into output HTML.
 *
 * The returned function:
 *   - renders the element through GetClearHtml, using 'h2' instead of
 *     `a_OutTag` when the element's class matches 'Title_title.*';
 *   - passes the HTML through `a_ClearTextFunc`, if one was given;
 *   - wraps the result in <blockquote> when the element has
 *     `data-type="quote"`.
 *
 * @param {string} [a_OutTag] - default outer tag for rendered elements
 * @param {string} [a_TextAlign] - CSS text-align value handed to GetClearHtml
 * @param {function(string, number): string} [a_ClearTextFunc] - optional
 *     post-processor receiving the HTML and the element index
 * @returns {function(Element, number): string} renderer taking the element
 *     and its 1-based index
 */
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
    return function FinishWorkFunc(a_Element, a_ElementIndex) {
        const isSubheading = a_Element
            && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element);
        const tag = isSubheading ? 'h2' : a_OutTag;

        let html = GetClearHtml(a_Element, tag, a_TextAlign);
        if (a_ClearTextFunc) {
            html = a_ClearTextFunc(html, a_ElementIndex);
        }

        const isQuote = a_Element && a_Element.dataset && a_Element.dataset.type == 'quote';
        return isQuote ? '<blockquote>' + html + '</blockquote>' : html;
    };
}
// Создание контента для стандартных новостей
/**
 * Assembles the HTML of a standard news article: title, lead image, body.
 *
 * Elements are located by class-name patterns below the given base elements.
 * The title is rendered as a centred h2, the image as a centred paragraph and
 * the body blocks as justified paragraphs; the body cleaner built from
 * `a_ClearTextPatterns` is restricted to the first body block.
 *
 * @param {Element|Document} a_BaseElementTitle - search root for the title
 * @param {Element|Document} a_BaseElementImage - search root for the image
 * @param {Element|Document} a_BaseElementText - search root for the body
 * @param {string} a_TitleRegExpElementPattern - class pattern of title elements
 * @param {string} a_ImageRegExpElementPattern - class pattern of image containers
 * @param {string} a_TextRegExpElementPattern - class pattern of body blocks
 * @param {function(Element): boolean} a_ElementChecker - filter for body blocks
 * @param {string[]} a_ClearTextPatterns - splitters for ClearTextFuncTemplate
 * @returns {string} concatenated HTML
 */
function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText, a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern, a_ElementChecker, a_ClearTextPatterns) {
    const renderTitle = FinishWorkFuncTemplate('h2', 'center');
    const grabText = GrubTextFuncTemplate();
    const renderParagraph = FinishWorkFuncTemplate('p', 'justify', ClearTextFuncTemplate(a_ClearTextPatterns, true));
    const findByClass = (pattern, base) => FindElementsByRegExp(GetElementClassName, pattern, base);

    const titleHtml = GetContentInContainers(
        findByClass(a_TitleRegExpElementPattern, a_BaseElementTitle),
        grabText,
        renderTitle
    );
    const imageHtml = GetImageInContainers(
        findByClass(a_ImageRegExpElementPattern, a_BaseElementImage),
        'center'
    );
    const bodyBlocks = FIlterElements(
        findByClass(a_TextRegExpElementPattern, a_BaseElementText),
        a_ElementChecker
    );
    const bodyHtml = GetContentInContainers(bodyBlocks, grabText, renderParagraph);

    return '' + titleHtml + imageHtml + bodyHtml;
}
// Создание контента для сайта
/**
 * Builds the markup of the output widget for the current page.
 *
 * Depending on `location.hostname` the article is extracted either as cleaned
 * HTML (tass.ru, ria.ru, rg.ru, russian.rt.com) followed by a source link, or
 * as a plain-text summary (zakonvremeni.ru: title, categories, first sentence
 * and page URL). The extracted text is placed inside a <textarea>.
 *
 * The text is escaped before it is put between the textarea tags: the caller
 * inserts the returned string through innerHTML, and textarea content is
 * parsed with character references decoded, so unescaped text would show
 * `&amp;` as `&` and could close the textarea early with a literal
 * `</textarea>`.
 *
 * @returns {string} HTML of the textarea, or '' when nothing was extracted
 *     (unknown host, or no matching elements)
 */
function MakeContent() {
    // Makes arbitrary text safe to embed as the initial value of a <textarea>.
    const EscapeForTextarea = (a_Text) => a_Text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');

    let content = '';
    let source_add = true;
    if (location.hostname === 'tass.ru') {
        // test: https://tass.ru/proisshestviya/19117971
        const base_element = document.getElementById('content_box');
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element,
            '(ArticleHeader_titles|tass_pkg_title--variant_h1_default).*',
            'Image_wrapper_.*',
            '(Paragraph_paragraph|Title_title).*',
            ElementCheckerTrue,
            ['. ']
        );
    }
    else if (location.hostname === 'ria.ru') {
        // test: https://ria.ru/20231020/ssha-1904210900.html
        const base_element = document.getElementsByClassName('article__header')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__title',
            'photoview__open',
            'article__block',
            ElementCheckerRia,
            // Everything up to the first closing </strong> of the first block is cut.
            ['</strong>']
        );
    }
    else if (location.hostname === 'rg.ru') {
        // test: https://rg.ru/2023/10/28/volontery-iz-evrosoiuza-privezli-dlia-zhitelej-donbassa-20-tonn-gumanitarnogo-gruza.html
        // NOTE(review): when no 'article__body' element exists this is undefined
        // and the body search falls back to the whole document.
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            document,
            document,
            base_element_text,
            'PageArticleContent_title.*',
            'PageArticleContent_image.*',
            '(PageContentCommonStyling_text|PageArticleContent_lead).*',
            ElementCheckerRia,
            []
        );
    }
    else if (location.hostname === 'russian.rt.com') {
        // test: https://russian.rt.com/business/article/1222163-centrobank-stavka-oktyabr-2023
        const base_element = document.getElementsByClassName('article article_article-page')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__heading',
            'article__cover article__cover_article-page',
            'article__text',
            ElementCheckerRia,
            []
        );
    }
    else if (location.hostname === 'zakonvremeni.ru') {
        const base_element = document.getElementsByClassName('item-page')[0];
        // Pages without an article (e.g. listings) have no '.item-page'; skip them
        // instead of failing on the missing element.
        if (base_element) {
            const grub_text_func = GrubTextFuncTemplate();
            // The extractor is passed as the renderer too, so that the plain text
            // is produced regardless of how GetContentInContainers treats a
            // missing third argument.
            const plain_text = (a_ClassPattern) => GetContentInContainers(
                FindElementsByRegExp(GetElementClassName, a_ClassPattern, base_element),
                grub_text_func,
                grub_text_func
            );
            const title = plain_text('page-header');
            const parent_category = plain_text('parent-category-name');
            const category = plain_text('category-name');
            const body = base_element.querySelector('[itemprop=articleBody]');
            // Only the first sentence (up to and including the first '.') is kept.
            const page = body ? RemoveAfterSplitter(TrimString(body.textContent), '.', true) : '';
            content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL;
        }
        source_add = false;
    }
    let result = '';
    if (content.length > 0) {
        let text = content;
        if (source_add) {
            text += '<p style="text-align: justify;">Источник: <a href = "' + ClearUrl(document.URL) + '">' + location.hostname + '</a></p>';
        }
        result = '<textarea id = "news_content" rows="10" cols="100">' + EscapeForTextarea(text) + '</textarea>';
    }
    return result;
}
// Entry point: extract the article and show the result at the very top of the
// page. Nothing is inserted when extraction produced no output, so pages that
// are not articles no longer get an empty 800px wrapper prepended to <body>.
const content = MakeContent();
if (content.length > 0) {
    const logo = document.createElement('div');
    logo.innerHTML = '<div style="margin: 0pt auto; width: 800px; text-align: center;">' + content + '</div>';
    document.body.insertBefore(logo, document.body.firstChild);
}
} ) ( ) ;