// ==UserScript==
// @name News parser
// @namespace http://zakonvremeni.ru
// @version 0.3.5
// @description Parse news
// @author AlexeiBv+mirocod@narod.ru
// @match https://tass.ru/*
// @match https://ria.ru/*
// @match https://rg.ru/*
// @match https://www.cnews.ru/*
// @match https://mixednews.ru/*
// @match https://russian.rt.com/*
// @match https://iz.ru/*
// @match https://zakonvremeni.ru/*
// @icon https://icons.duckduckgo.com/ip2/zakonvremeni.ru.ico
// @grant none
// ==/UserScript==
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_news_parser@narod.ru>
( function ( ) {
'use strict' ;
// Поиск элементов по регулярному выражению
// Returns the class list of an element as a plain string.
// HTML elements expose `className` as a string, but SVG elements expose an
// SVGAnimatedString object; handing that object to a RegExp would test the
// text "[object SVGAnimatedString]" instead of the real classes, so its
// `baseVal` is returned instead. Any other value yields an empty string.
function GetElementClassName(a_Element) {
    const class_name = a_Element.className;
    if (typeof class_name === 'string') {
        return class_name;
    }
    if (class_name && typeof class_name.baseVal === 'string') {
        return class_name.baseVal;
    }
    return '';
}
// Accessor usable as the name getter of CheckRegExp: yields the element's nodeName.
function GetNodeName(a_Element) {
    const { nodeName } = a_Element;
    return nodeName;
}
// Tests whether the name of an element contains a whitespace-delimited token
// that matches a pattern.
//   a_GetElementNameFunc - function(element) returning the string to test
//                          (e.g. GetElementClassName or GetNodeName)
//   a_RegExpPattern      - regular expression source for one token
//   a_Element            - element handed to a_GetElementNameFunc
// Returns true when a token matches, false otherwise.
function CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, a_Element) {
    // The pattern is wrapped in its own group so that a top-level alternation
    // such as 'foo|bar' keeps BOTH token boundaries on every alternative;
    // without the group it parsed as "(^|\s)foo" OR "bar(?!\S)".
    const re = new RegExp('(?:^|\\s)(?:' + a_RegExpPattern + ')(?!\\S)');
    const name = a_GetElementNameFunc(a_Element);
    // A missing name is tested as an empty string rather than the text "undefined".
    return re.test(name == null ? '' : String(name));
}
// Collects every descendant of a parent whose name (as produced by
// a_GetElementNameFunc) satisfies CheckRegExp for the given pattern.
// When no parent is supplied the whole document is searched.
// Returns a plain array in document order.
function FindElementsByRegExp(a_GetElementNameFunc, a_RegExpPattern, a_ElementParent) {
    const search_root = a_ElementParent || document;
    const all_descendants = Array.from(search_root.getElementsByTagName('*'));
    return all_descendants.filter(function (candidate) {
        return CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, candidate);
    });
}
// Работа со строками
// Strips leading and trailing whitespace; a falsy argument is treated as ''.
function TrimString(str) {
    const text = str || '';
    return text.trim();
}
// Drops everything up to and including the first occurrence of a_Splitter.
// The string is returned unchanged when the splitter does not occur.
function RemoveBeforeSplitter(a_String, a_Splitter) {
    const found_at = a_String.indexOf(a_Splitter);
    return found_at === -1
        ? a_String
        : a_String.substring(found_at + a_Splitter.length);
}
// Cuts the string at the first occurrence of a_Splitter.
// With a truthy a_SaveSplitter the splitter itself stays at the end of the
// result; otherwise it is cut off as well. A string without the splitter is
// returned unchanged.
function RemoveAfterSplitter(a_String, a_Splitter, a_SaveSplitter) {
    const cut_at = a_String.indexOf(a_Splitter);
    if (cut_at === -1) {
        return a_String;
    }
    const splitter_length = a_Splitter.length;
    const kept_length = a_SaveSplitter ? splitter_length : 0;
    return a_String.substring(0, cut_at + kept_length);
}
// Returns the URL without its query string: everything from the first '?'
// onwards is cut off.
function ClearUrl(a_Url) {
    return RemoveAfterSplitter(a_Url, '?', false);
}
// Removes a leading "www." from a host name or URL; a falsy argument gives ''.
// Only the prefix is removed (optionally after a "scheme://" part). The old
// plain-string replace removed the first "www." found anywhere, which mangled
// hosts such as "mywww.example.com".
function ClearWWW(a_Url) {
    return (a_Url || '').replace(/^([a-z][a-z0-9+.-]*:\/\/)?www\./i, '$1');
}
// Creates a text cleaner bound to a list of markers.
//   a_RemoveBeforeList - markers; for each one, in order, everything up to and
//                        including its first occurrence is removed
//   a_OnlyFirstIndex   - when truthy, only the element with index 1 (or a
//                        falsy index) is cleaned; other elements pass through
// The returned function takes (a_Content, a_ElementIndex) and returns the
// cleaned content.
function ClearTextFuncTemplate(a_RemoveBeforeList, a_OnlyFirstIndex) {
    return function ClearTextFunc(a_Content, a_ElementIndex) {
        const is_later_element = a_ElementIndex && a_ElementIndex != 1;
        if (a_OnlyFirstIndex && is_later_element) {
            return a_Content;
        }
        let remaining = a_Content;
        for (const marker of a_RemoveBeforeList) {
            remaining = RemoveBeforeSplitter(remaining, marker);
        }
        return remaining;
    };
}
// Работа с контейнерами
// Builds a paragraph with one picture taken from the given containers.
//   a_Elements  - array of container elements, searched in order
//   a_TextAlign - value for the paragraph's text-align style
// Each container and all of its descendants are inspected for IMG nodes whose
// src is an http(s) URL containing ".png" or ".jpg". The search stops at the
// first container that yields a picture; inside that container the last
// matching IMG wins. Returns the markup, or '' when nothing matched.
function GetImageInContainers(a_Elements, a_TextAlign) {
    // Regex literal on purpose: built from a string literal the backslashes
    // were dropped, so the "\." before the extension matched ANY character.
    const image_url_re = /https?:\/\/.*\.(?:png|jpg)/;
    let img_src = '';
    // for...of instead of for...in: for...in also visits enumerable
    // properties that other code may have added to Array.prototype.
    for (const container of Array.from(a_Elements || [])) {
        const candidates = [container].concat(Array.from(container.querySelectorAll('*')));
        for (const node of candidates) {
            if (node.nodeName === 'IMG' && image_url_re.test(node.src)) {
                img_src = node.src;
            }
        }
        if (img_src.length > 0) {
            break;
        }
    }
    if (img_src.length > 0) {
        return '<p style = "text-align:' + a_TextAlign + ';"><img src = "' + ClearUrl(img_src) + '" width = "600px"/></p>';
    }
    return '';
}
// Concatenates the output of a_FinishWorkFunc for every element.
//   a_Elements       - array (or array-like) of elements; missing input gives ''
//   a_GrubTextFunc   - not used by this function; kept so existing call sites
//                      stay valid
//   a_FinishWorkFunc - function(element, index) returning a string; index is
//                      1-based
// Returns the concatenated string.
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
    let result = '';
    let element_index = 1;
    // for...of instead of for...in: for...in also visits enumerable
    // properties that other code may have added to Array.prototype.
    for (const element of Array.from(a_Elements || [])) {
        result += a_FinishWorkFunc(element, element_index);
        element_index += 1;
    }
    return result;
}
// Фильтрация элементов
// Returns a new array with the elements for which a_ElementChecker(element)
// is truthy, preserving their order.
function FIlterElements(a_Elements, a_ElementChecker) {
    return Array.from(a_Elements).filter(function (candidate) {
        return a_ElementChecker(candidate);
    });
}
// Element checker that accepts everything; the argument is ignored.
function ElementCheckerTrue(a_Element) {
    return true;
}
// Element checker that rejects everything; the argument is ignored.
function ElementCheckerFalse(a_Element) {
    return false;
}
// Element checker for ria.ru blocks: rejects blocks whose data-type is one of
// article, banner, media, video or photolenta and accepts all others.
function ElementCheckerRia(a_Element) {
    const rejected_types = ['article', 'banner', 'media', 'video', 'photolenta'];
    const is_rejected = rejected_types.some(function (type_name) {
        return a_Element.dataset.type == type_name;
    });
    return !is_rejected;
}
// Element checker that accepts only the element marked itemprop="articleBody".
// The value is read with getAttribute because DOM elements do not expose an
// `itemprop` property, so the former property test could never succeed on a
// real element. Plain objects without getAttribute still fall back to the
// property.
function ElementCheckerZV(a_Element) {
    const item_prop = typeof a_Element.getAttribute === 'function'
        ? a_Element.getAttribute('itemprop')
        : a_Element.itemprop;
    return item_prop === 'articleBody';
}
// Creates a predicate that decides whether a sub-element must be removed.
//   a_Classes   - alternation of class-name fragments (regular expression
//                 source); a class token containing any of them matches
//   a_NodeNames - optional alternation of node names that match as a whole
// The predicate returns false for a missing element or one without a parent
// node, true when the class or the node name matches, false otherwise.
function SubElementCheckerToRemoveTemplate(a_Classes, a_NodeNames) {
    return function SubElementCheckerToRemove(a_Element) {
        const is_attached = Boolean(a_Element) && Boolean(a_Element.parentNode);
        if (!is_attached) {
            return false;
        }
        const class_pattern = '.*(' + a_Classes + ').*';
        if (CheckRegExp(GetElementClassName, class_pattern, a_Element)) {
            return true;
        }
        if (!a_NodeNames) {
            return false;
        }
        return CheckRegExp(GetNodeName, '(' + a_NodeNames + ')', a_Element) ? true : false;
    };
}
// Обработка элементов
// Creates a text extractor. The returned function yields the trimmed
// textContent of an element, or '' when the element's innerText is empty.
function GrubTextFuncTemplate() {
    return function GrubTextFunc(a_Element) {
        if (!a_Element.innerText) {
            return '';
        }
        return TrimString(a_Element.textContent);
    };
}
// Strips the attributes of an element in place.
//   a_Element        - element to clean
//   a_KeepAttributes - optional list of lower-case attribute names to keep;
//                      defaults to ['href', 'src'] so that links and pictures
//                      in the cleaned markup keep pointing at their targets
// The previous body assigned to a misspelled "outherHTML" property, which
// only created an unused expando, so no attribute was ever removed.
// NOTE(review): with the typo gone the generated HTML loses attributes such
// as style, target or data-* that used to leak through - confirm that the
// default keep-list matches what the output should contain.
function RemoveAllAttributes(a_Element, a_KeepAttributes) {
    const keep = a_KeepAttributes || ['href', 'src'];
    // Snapshot the names first: the attribute collection changes while
    // attributes are being removed.
    const names = Array.from(a_Element.attributes || [], function (attribute) {
        return attribute.name;
    });
    names.forEach(function (name) {
        if (keep.indexOf(name.toLowerCase()) === -1) {
            a_Element.removeAttribute(name);
        }
    });
}
// Unwraps an element: its children are moved in front of it, in order, and
// the element itself is then removed. a_Parent is used only when the element
// has no parentNode.
function RemoveCurrentElementSaveChild(a_Element, a_Parent) {
    const host = a_Element.parentNode || a_Parent;
    for (let child = a_Element.firstChild; child; child = a_Element.firstChild) {
        host.insertBefore(child, a_Element);
    }
    host.removeChild(a_Element);
}
// Deletes HTML comments from a string: single-line comments, multi-line
// comments, and an unterminated comment running to the end of the string.
// A falsy argument gives ''.
function RemoveCommentsHTML(a_String) {
    const html = a_String ? a_String : '';
    const comment_re = /(<!--.*?-->)|(<!--[\S\s]+?-->)|(<!--[\S\s]*?$)/g;
    return html.replace(comment_re, '');
}
// Produces cleaned-up HTML for one article fragment.
//   a_Element                 - source element; it is cloned, the page itself
//                               is not modified
//   a_OutTag                  - tag to wrap the result in (optional)
//   a_TextAlign               - text-align value for p/h2/li and the wrapper
//   a_SubElementCheckerToRemove - optional predicate; matching descendants
//                               are dropped together with their content
//   a_ClearTextFunc           - optional function(html, index) applied to the
//                               serialized result
//   a_ElementIndex            - index passed through to a_ClearTextFunc
// Returns the resulting HTML string; '' when a wrapper was requested but the
// fragment holds nothing visible.
function GetClearHtml(a_Element, a_OutTag, a_TextAlign, a_SubElementCheckerToRemove, a_ClearTextFunc, a_ElementIndex) {
    // True when markup consists only of whitespace and non-breaking spaces.
    // innerHTML serializes U+00A0 as "&nbsp;", which trimming alone does not
    // catch (the former `.replace(' ', '')` after trimming had no effect).
    function IsBlankHtml(a_Html) {
        return (a_Html || '').replace(/&nbsp;|\s/g, '') === '';
    }

    const clear_element = a_Element.cloneNode(true);

    // Pass 1: drop unwanted sub-elements, strip attributes from the rest.
    clear_element.querySelectorAll('*').forEach(function (element) {
        if (a_SubElementCheckerToRemove && a_SubElementCheckerToRemove(element)) {
            element.parentNode.removeChild(element);
        }
        else {
            RemoveAllAttributes(element);
            element.removeAttribute('class');
            element.removeAttribute('id');
        }
    });

    // Pass 2: unwrap purely structural/decorative tags, keeping their children.
    const tags_to_delete = ['div', 'span', 'em', 'svg', 'path', 'u'];
    tags_to_delete.forEach(function (del_tag_name) {
        clear_element.querySelectorAll(del_tag_name).forEach(function (element) {
            RemoveCurrentElementSaveChild(element, clear_element);
        });
    });

    // Pass 3: unwrap links that lead back to the current site. `hostname` is
    // compared on both sides; `host` would also carry a non-default port.
    clear_element.querySelectorAll('a').forEach(function (element) {
        if (element.hostname === location.hostname) {
            RemoveCurrentElementSaveChild(element, clear_element);
        }
    });

    // Pass 4: align text blocks and discard the ones that became empty.
    const tags_to_align = ['p', 'h2', 'li'];
    tags_to_align.forEach(function (align_tag_name) {
        clear_element.querySelectorAll(align_tag_name).forEach(function (element) {
            element.style.textAlign = a_TextAlign;
            if (IsBlankHtml(element.innerHTML)) {
                element.parentNode.removeChild(element);
            }
        });
    });

    let result = TrimString(RemoveCommentsHTML(clear_element.innerHTML));
    if (a_ClearTextFunc) {
        result = a_ClearTextFunc(result, a_ElementIndex);
    }
    if (a_OutTag && a_TextAlign) {
        if (IsBlankHtml(result)) {
            // Nothing visible is left: do not emit an empty wrapper.
            return '';
        }
        result = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + result + '</' + a_OutTag + '>';
    }
    return result;
}
// Creates the function that turns one element into its final HTML.
//   a_OutTag, a_TextAlign, a_ClearTextFunc, a_SubElementCheckerToRemove are
//   forwarded to GetClearHtml.
// The returned function(a_Element, a_ElementIndex):
//   - uses 'h2' instead of a_OutTag when the element has a class token
//     starting with "Title_title";
//   - wraps the result in <blockquote> when the element's data-type is 'quote'.
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc, a_SubElementCheckerToRemove) {
    return function FinishWorkFunc(a_Element, a_ElementIndex) {
        const is_subtitle = a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element);
        const out_tag = is_subtitle ? 'h2' : a_OutTag;
        const html = GetClearHtml(a_Element, out_tag, a_TextAlign, a_SubElementCheckerToRemove, a_ClearTextFunc, a_ElementIndex);
        const is_quote = a_Element && a_Element.dataset && a_Element.dataset.type == 'quote';
        return is_quote ? '<blockquote>' + html + '</blockquote>' : html;
    };
}
// Finisher that returns the element's plain text (via the extractor from
// GrubTextFuncTemplate) instead of HTML. a_ElementIndex is not used.
function FinishWorkFuncZV(a_Element, a_ElementIndex) {
    const grub_text = GrubTextFuncTemplate();
    return grub_text(a_Element);
}
// Создание контента для стандартных новостей
// Tag that wraps article headlines.
const title_tag = 'h2';
// Shared finisher for headline elements: output is wrapped in a centered <h2>.
const title_finish_text_func = FinishWorkFuncTemplate(title_tag, 'center');
// Assembles the HTML of a regular news article: headline, picture, body text.
//   a_BaseElementTitle / a_BaseElementImage / a_BaseElementText
//       - roots that are searched for the headline, the picture and the text
//   a_TitleRegExpElementPattern / a_ImageRegExpElementPattern /
//   a_TextRegExpElementPattern
//       - class-name patterns selecting the respective elements
//   a_ElementChecker            - filter applied to the text elements
//   a_SubElementCheckerToRemove - predicate for sub-elements to drop from text
//   a_ClearTextPatterns         - markers; the first text element is cut after
//                                 each of them
// Returns the three parts concatenated in that order.
function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText, a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern, a_ElementChecker, a_SubElementCheckerToRemove, a_ClearTextPatterns) {
    const grub_func = GrubTextFuncTemplate();
    const clear_text_func = ClearTextFuncTemplate(a_ClearTextPatterns, true);
    const paragraph_finish_text_func = FinishWorkFuncTemplate('p', 'justify', clear_text_func, a_SubElementCheckerToRemove);

    const title_elements = FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle);
    const title_html = GetContentInContainers(title_elements, grub_func, title_finish_text_func);

    const image_elements = FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage);
    const image_html = GetImageInContainers(image_elements, 'center');

    const text_candidates = FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText);
    const text_elements = FIlterElements(text_candidates, a_ElementChecker);
    const text_html = GetContentInContainers(text_elements, grub_func, paragraph_finish_text_func);

    return '' + title_html + image_html + text_html;
}
// Создание контента для сайта
// Builds the markup of the helper <textarea> for the site in location.hostname.
// Every supported host has its own branch with the selectors of that site.
// For news sites the textarea holds article HTML followed by a source link;
// for zakonvremeni.ru it holds a plain-text summary without the source link.
// Returns '' when the host is not supported or nothing could be extracted.
// The caller inserts the returned string through innerHTML.
function MakeContent() {
    // Makes text safe to place between <textarea> tags of an innerHTML string:
    // character references are decoded there and "</textarea" would end the
    // field early, so "&" and "<" are escaped to make the field show the text
    // verbatim.
    function EscapeForTextarea(a_Text) {
        return a_Text.replace(/&/g, '&amp;').replace(/</g, '&lt;');
    }

    let content = '';
    let source_add = true;
    const grub_text_func = GrubTextFuncTemplate();
    let host_name = null;
    if (location.hostname === 'tass.ru') {
        // test: https://tass.ru/proisshestviya/19117971
        const base_element = document.getElementById('content_box');
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element,
            '(ArticleHeader_titles|tass_pkg_title--variant_h1_default).*',
            'Image_wrapper_.*',
            '(Paragraph_paragraph|Title_title).*',
            ElementCheckerTrue,
            ElementCheckerFalse,
            ['/ТАСС/. ']
        );
    }
    else if (location.hostname === 'ria.ru') {
        // test: https://ria.ru/20231020/ssha-1904210900.html
        const base_element = document.getElementsByClassName('article__header')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__title',
            'photoview__open',
            'article__block',
            ElementCheckerRia,
            ElementCheckerFalse,
            // Cuts the first block after its first closing </strong>
            // (presumably the dateline - verify against a live article).
            ['</strong>']
        );
    }
    else if (location.hostname === 'rg.ru') {
        // test: https://rg.ru/2023/10/28/volontery-iz-evrosoiuza-privezli-dlia-zhitelej-donbassa-20-tonn-gumanitarnogo-gruza.html
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            document,
            document,
            base_element_text,
            '.*Content_title.*',
            '.*(Content_image|RgPhotoreportClassic).*',
            '(PageContentCommonStyling_text|.*Content_lead).*',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'portal|rg-incut|article-img|Section',
                'RG-VIDEO|RG-INCUT|RG-PHOTOREPORT'
            ),
            []
        );
    }
    else if (location.hostname === 'russian.rt.com') {
        // test: https://russian.rt.com/business/article/1222163-centrobank-stavka-oktyabr-2023
        const base_element = document.getElementsByClassName('article article_article-page')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__heading',
            'article__cover article__cover_article-page',
            'article__text',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'read-more|article__cover'
            ),
            []
        );
        if (content.length === 0) {
            // Fallback layout.
            // test: https://russian.rt.com/inotv/2023-10-27/DELFI-Latviya-budet-konfiskovivat-mashini
            const page_column = document.getElementsByClassName('left-column page')[0];
            const base_element_title = page_column ? page_column.getElementsByTagName("h1")[0] : undefined;
            // Without the headline node this is not such a page (e.g. a
            // section index); leave content empty instead of throwing.
            if (base_element_title) {
                const base_element_image = document.getElementsByTagName("figure")[0];
                const inotv_element_text = document.getElementsByTagName("article")[0];
                content = title_finish_text_func(base_element_title) +
                    MakeContentByNews(
                        base_element_title,
                        base_element_image,
                        inotv_element_text,
                        '!!!!', // matches nothing: the headline is added above
                        '.*',
                        'article-intro|article-body',
                        ElementCheckerTrue,
                        SubElementCheckerToRemoveTemplate(
                            'meta',
                            'IMG'
                        ),
                        []
                    );
                if (content.length > 0) {
                    host_name = 'inotv';
                }
            }
        }
    }
    else if (location.hostname === 'www.cnews.ru') {
        // test: https://www.cnews.ru/news/top/2023-10-27_rossiyane_sozdali_polnotsennyj
        const base_element = document.getElementsByClassName('news_containere')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            document,
            '!!!', // no title
            'img-block',
            'news_container',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'article-top-author|article-menu_base|d-flex|img-block|NewsBodyLeftInclude|mobile-zone|other-news-note|cnLike|article-bottom-info|banner|comments_all',
                'NOINDEX|BR'
            ),
            []
        );
    }
    else if (location.hostname === 'mixednews.ru') {
        // test: https://mixednews.ru/archives/180224
        const base_element = document.getElementsByClassName('entry-header')[0];
        content = MakeContentByNews(
            base_element,
            document,
            document,
            'entry-title',
            'entry-content',
            'entry-content',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'ssba',
                'NOSCRIPT|SCRIPT|BR|IMG|!--'
            ),
            []
        );
    }
    else if (location.hostname === 'iz.ru') {
        // test: https://iz.ru/1639291/2024-01-24/amerikanskii-esminetc-uss-john-finn-proshel-cherez-taivanskii-proliv
        const base_element = document.getElementById('block-purple-content');
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element,
            'm-t-10|top_big_img_article__info__inside__title',
            'big_photo__img|top_big_img_article__img',
            'text-article__inside',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'more_style_one|igi-player|share_bottom|recommendation-block|slider-block|layer-'
            ),
            []
        );
    }
    else if (location.hostname === 'zakonvremeni.ru') {
        const base_element = document.getElementsByClassName('item-page')[0];
        const article_body = base_element ? base_element.querySelector('[itemprop=articleBody]') : null;
        // Pages without an article body (lists, front page) produce nothing
        // instead of throwing on the missing node.
        if (article_body) {
            const title = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'page-header', base_element), grub_text_func, FinishWorkFuncZV);
            const parent_category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'parent-category-name', base_element), grub_text_func, FinishWorkFuncZV);
            const category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'category-name', base_element), grub_text_func, FinishWorkFuncZV);
            // Only the text up to and including the first '.' is kept.
            const page = RemoveAfterSplitter(TrimString(article_body.textContent), '.', true);
            content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL;
        }
        source_add = false;
    }
    let result = '';
    if (content.length > 0) {
        let textarea_text = content;
        if (source_add) {
            textarea_text += '<p style="text-align: justify;">Источник: <a href = "' + ClearUrl(document.URL) + '">' + ClearWWW(host_name || location.hostname) + '</a></p>';
        }
        result = '<textarea id = "news_content" rows="10" cols="100">' + EscapeForTextarea(textarea_text) + '</textarea>';
    }
    return result;
}
// Build the helper markup and inject it into the page.
const content = MakeContent();
const news_text = document.createElement("div");
news_text.innerHTML = '<div style="margin: 0pt auto; width: 800px; text-align: center;">' + content + '</div>';
// On iz.ru the block is appended to the headline node; on every other site,
// and on iz.ru pages without that node, it goes to the very top of <body>.
let anchor_element = null;
if (location.hostname === 'iz.ru') {
    // The site's top panel is removed; guard against pages that lack it.
    const top_panel = document.getElementsByClassName('top-panel')[0];
    if (top_panel && top_panel.parentNode) {
        top_panel.parentNode.removeChild(top_panel);
    }
    anchor_element = document.getElementsByClassName('m-t-10')[0]
        || document.getElementsByClassName('top_big_img_article__info__inside__title')[0]
        || null;
}
if (anchor_element) {
    anchor_element.appendChild(news_text);
}
else {
    document.body.insertBefore(news_text, document.body.firstChild);
}
} ) ( ) ;