Дипломная работа: Автоматическое выявление вербальной агрессии в Интернет-коммуникации

Внимание! Если размещение файла нарушает Ваши авторские права, то обязательно сообщите нам

56. Carter Jimmy, Chapter Title: «Religious Tolerance»// Abraham's Children: Yale University Press. (2012)

57. Chetviorkin I., Loukachevitch N. «Extraction of Russian Sentiment Lexicon for Product Meta-Domain» // Proceedings of the 24th International Conference on Computational Linguistics (COLING'12), 2012, pp. 593-610

58. Colleen Cotter «News talk: investigating the language of journalism»// Cambridge University Press, the Edinburgh Building, UK. - 2010

59. Crawford Jarret T., Pilanski Jane M. «Political Intolerance, Right and Left»// Political Psychology, 2012.

60. Davis Darren W. «Exploring Black Political Intolerance»// Political Behavior, Vol.17, No.1: Springer, Mar. 1995, pp. 1-22

61. Ganu, G., Elhadad, N., and Marian, A. «Beyond the Stars: Improving Rating Predictions using Review Text Content» // In WebDB (Vol. 9, pp. 1-6), 2009

62. Gibson James L. «Alternative Measures of Political Tolerance: Must Tolerance be 'Least-Liked'?»// American Journal of Political Science, Vol.36, No.2: Midwest Political Science Association, May, 1992, pp. 560-577

63. Gibson James L. «Political Intolerance in the context of Democratic Theory»// The Oxford Handbook of Political Science, Jul. 2011.

64. Hu M., Liu B. «Mining and summarizing customer reviews» // International Conference on Knowledge Discovery and Data Mining (ICDM), 2004

65. Jin Wei, Hung Hay Ho, «A novel lexicalized HMM-based learning framework for web opinion mining» // in Proceedings of International Conference on Machine Learning (ICML-2009).

66. Lester Emile, Chapter Title: «The Distinctive Paradox of Religious Tolerance»// Teaching about Religions: University of Michigan Press. (2011)

67. Liu Baodong «Racial Contexts and White Interests: Beyond Black Threat and Racial Tolerance»// Political Behavior, Vol.23, No.2: Springer, Jun. 2001, pp.157-180

68. Loukachevitch N., Levchik A. «Creating a General Russian Sentiment Lexicon» // Proceedings of Language Resources and Evaluation Conference LREC-2016, 2016.

69. McEnery T., Wilson A. «Corpus Linguistics». - Edinburgh: Edinburgh University Press, 2001.

70. Paden John N., Chapter Title: «Religious Tolerance and Conflict Resolution»// Muslim Civic Cultures and Conflict Resolution: Brookings Institution Press. (2005)

71. Persell Caroline Hodges, Green Adam, Gurevich Liena, «Civil Society, Economic Distress, and Social Tolerance»// Sociological Forum, Vol.16, No.1: Springer, Jun.2001, pp.203-230

72. Popescu A. M., Etzioni O., «Extracting product features and opinions from reviews, In Natural language processing and text mining», 2007, pp. 9-28

73. Scaffidi C., Bierhoff K., Chang E., Felker M., Ng H., Jin C., «Red Opal: product-feature scoring from reviews» // In Proceedings of the 8th ACM conference on Electronic commerce, 2007, pp. 182-191

74. Scanlon T.M., Chapter Title «The Difficulty of Tolerance»// Toleration: Princeton University Press. (1996)

75. Scott Jerrie C. «Of nicknames, slurs, and name-calling»// American Speech, Vol. 61, No.2: Duke University Press, Summer, 1986, pp. 172-175

76. Sharma Arvind «Religious Tolerance in Three Contexts»// India International Centre Quarterly, Vol. 22, No.1: India International Centre, Spring 1995, pp. 29-34

77. Sharp Elaine B., Joslyn Mark R. «Culture, Segregation, and Tolerance in Urban America»// Social Science Quarterly, Vol.89, No.3: Wiley, Sept. 2008, pp. 573-591

78. Tautkus Rita E. «Comment, Speech Regulation at the University of California: Void for Vagueness or Overbreadth»// 32 Santa Clara L. Rev. 1259 (1992).

79. Turney P. D., Littman M. L. «Measuring praise and criticism: Inference of semantic orientation from association» // ACM Transactions on Information Systems (TOIS), 21(4):315-346, 2003.

Приложение

Код реализации алгоритма сбора текстовых данных с блог-платформы «Живой Журнал» на языке Python 3.

from bs4 import BeautifulSoup

from urllib.request import Request, urlopen

from selenium import webdriver

import os, requests, urllib.request

import time

from lxml.html import parse

# Scrape LiveJournal search results for posts tagged "феминизм"
# (2018-01-01 .. 2019-05-10) and save each post's main text block to
# texts/USA/<post-id>.txt.
#
# Fixes vs. the original:
#  * the links loop no longer shadows the outer loop variable `i`;
#  * output files are opened with `with`, so handles are always closed;
#  * the blanket bare `except: pass` is narrowed: a failure on one <div>
#    skips that <div> only, and the file is written once per post, after
#    the largest <div> has been found.
sess = requests.Session()  # NOTE(review): unused below — kept for compatibility
driver = webdriver.Chrome()

# Pass 1: walk the paginated search results and collect post URLs.
links = []
for page_no in range(101):
    url = 'https://www.livejournal.com/rsearch?page=' + str(page_no) + '&tags=%D1%84%D0%B5%D0%BC%D0%B8%D0%BD%D0%B8%D0%B7%D0%BC&dateFrom=2018-01-01&dateTo=2019-05-10&searchArea=post'
    driver.get(url)
    time.sleep(5)  # let the JS-rendered result list finish loading
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for anchor in soup.find_all('a', class_='rsearch-note__caption'):
        links.append(anchor['href'])

# Pass 2: open every post page and store the text of the largest <div>,
# which on LiveJournal pages holds the post body.
for link in links:
    driver.get(link)
    time.sleep(10)  # post pages are heavier than the search pages
    best_len = 0
    best_idx = 0
    divs = driver.find_elements_by_tag_name('div')
    for j, div in enumerate(divs):
        try:
            if len(div.text) > best_len:
                best_len = len(div.text)
                best_idx = j
        except Exception:
            continue  # stale/undisplayable element — skip this <div> only
    path = 'texts/USA/' + link[link.rfind('/'):-5] + '.txt'
    try:
        with open(path, 'a', encoding='utf8') as out:
            out.write(divs[best_idx].text)
    except Exception:
        pass  # best effort: an unwritable file is skipped, as before

Код реализации алгоритма удаления из текста информации о меню веб-страницы, рекламы, комментариях и тегах на языке Python 3.

import os

# Clean the downloaded LiveJournal files in place: strip page menus, ads,
# pager links, category/tag blocks and footer chrome, keeping only post
# body lines.  The filtering uses two aliases: `text` is a frozen snapshot
# that is iterated, while `text_new` is the list actually being cut; the
# snapshot is refreshed after each stage (preserved verbatim from the
# original, slice-index quirks included).
#
# NOTE(review): the upper bound 40348466 looks like the highest observed
# post id; almost all ids in the range do not exist, hence the check.
for file_id in range(705, 40348466):
    path = str(file_id) + '.txt'
    if not os.path.exists(path):
        continue
    # fix: handles are closed via `with`; the original also truncated the
    # file *before* processing, losing its content if processing crashed.
    with open(path, 'r', encoding='utf8') as f:
        text = f.readlines()

    k = 0
    m = 0

    # Stage 1: drop blank lines.
    text_new = [ln for ln in text if ln != '\n']
    text = text_new

    # Stage 2: cut everything up to (and including) each menu/ad marker.
    words = ['ЯНДЕКС.ДИРЕКТ', '(RU)', 'В СТИЛЕ ЖЖ', 'Обратно', 'Previous', 'RVS', 'Rvs', 'НА ГЛАВНУЮ', 'Предыдущий пост']
    for w in words:
        text = text_new
        for i in range(len(text)):
            if w in text[i]:
                text_new = text_new[i+1:]
    text = text_new

    # Stage 3: remove the "Назад ... Вперёд" pager block.
    # NOTE(review): when 'Вперёд' is two lines below 'Назад', BOTH slices
    # run (cut 3 lines, then 2 more) — preserved as written; an `else:`
    # before the second slice was probably intended. Confirm with author.
    for i in range(len(text)):
        if 'Назад\n' in text[i]:
            if 'Вперёд\n' in text[i+2]:
                text_new = text_new[:i] + text_new[i+3:]
            text_new = text_new[:i] + text_new[i+2:]
    text = text_new

    # Stage 4: remove category blocks (marker line plus 1-2 lines after).
    for l in range(len(text)):
        if text[l] == 'Категория:\n':
            text_new = text_new[:l] + text_new[l+2:]
        if text[l] == 'Категории:\n':
            text_new = text_new[:l] + text_new[l+3:]
    text = text_new

    # Stage 5: cut the ad block between the last 'Яндекс.Директ' line and
    # the last 'Авторизуйтесь' / 'Надоела реклама?' line (k = m = 0 when
    # the markers are absent, so the slice is then a no-op).
    for i in range(len(text)):
        if 'Яндекс.Директ' in text[i]:
            k = i
        if 'Авторизуйтесь' in text[i] or 'Надоела реклама?' in text[i]:
            m = i + 1
    text_new = text_new[:k] + text_new[m:]
    text = text_new

    # Stage 6: truncate at the tag list or at the 'Подписаться' footer.
    for i in range(len(text)):
        if 'МЕТКИ' in text[i] or 'Tags' in text[i] or 'Метки' in text[i]:
            text_new = text_new[:i]
        else:
            if text[i] == 'Подписаться\n':
                text_new = text_new[:i-1]
    text = text_new

    # Stage 7: same cut-before-marker pass for footer/menu markers.
    words2 = ['Читать далее', 'ЧИТАТЬ ДАЛЕЕ', 'Archive', 'Friends', 'Profile', 'Memories', 'RSS', 'Архив']
    for w in words2:
        text = text_new
        for i in range(len(text)):
            if w in text[i]:
                text_new = text_new[i+1:]

    # fix: one truncating write replaces the early truncate + append pair.
    with open(path, 'w', encoding='utf8') as f:
        f.write(''.join(text_new))

# Second pass: delete files that ended up empty after cleaning, or that
# still contain LiveJournal chrome (and are therefore not post bodies).
for file_id in range(705, 40348466):
    path = str(file_id) + '.txt'
    if not os.path.exists(path):
        continue
    with open(path, 'r', encoding='utf8') as f:  # fix: close the handle
        text = f.readlines()
    if not text:
        os.remove(path)
    else:
        for line in text:
            if 'LIVEJOURNAL' in line or 'LiveJournal' in line:
                os.remove(path)
                # fix: the original kept scanning and could call os.remove
                # a second time on the same path -> FileNotFoundError
                break

Код реализации алгоритма удаления стоп-слов из текстов корпуса на языке Python 3.

import os

from nltk.corpus import stopwords

# Remove Russian stop words from every corpus file, rewriting it in place.
# fix: a set gives O(1) membership tests; the original list made the
# per-token filter O(len(stopwords)).
sw = set(stopwords.words('russian'))

for file_id in range(705, 40348466):
    fname = str(file_id) + '.txt'
    if not os.path.exists(fname):
        continue
    with open(fname, 'r', encoding='utf8') as f:  # fix: close the handle
        text = f.read().lower()
    kept = [w for w in text.split() if w not in sw]
    # fix: one truncating write replaces the original truncate-then-append
    # pair, which also leaked the append handle
    with open(fname, 'w', encoding='utf8') as f:
        f.write(' '.join(kept))

Код реализации алгоритма обработки текстов в корпусе на языке Python 3.

import re

import os

from pymystem3 import Mystem

from nltk.corpus import stopwords

# One Mystem instance shared across all files (construction is expensive).
m = Mystem()

def lem_file(n):
    """Lemmatize corpus file <n>.txt and write the result to new/<n>.txt.

    Pipeline: lowercase, join lines, fold 'ё' to 'е', keep only Cyrillic
    tokens, drop Russian stop words, lemmatize with Mystem, and drop the
    lemmas 'писать'/'написать'.  Does nothing when <n>.txt does not exist.
    """
    line = str(n) + '.txt'
    if not os.path.exists(line):
        return
    with open(line, 'r', encoding='utf8') as f:  # fix: close the handle
        text = f.read().lower()
    text = text.replace('\n', ' ').replace('ё', 'е')
    tokens = re.findall('[а-я]+', text)
    sw = set(stopwords.words('russian'))
    words = [w for w in tokens if w not in sw]
    lemmas = m.lemmatize(' '.join(words))
    # fix: the original test `l != 'писать' or l != 'написать'` is always
    # true, so nothing was ever filtered; `not in` implements the intent.
    kept = [l for l in lemmas if l not in ('писать', 'написать')]
    with open('new/' + line, 'a', encoding='utf8') as f:
        f.write(''.join(kept))

# Lemmatize every corpus file, then drop lemmatized files with fewer than
# four words (too short to classify meaningfully).
for file_id in range(316, 132415193):
    lem_file(file_id)

for file_id in range(316, 132415193):
    path = 'new/' + str(file_id) + '.txt'
    if os.path.exists(path):
        with open(path, 'r', encoding='utf8') as f:  # fix: close the handle
            text = f.read()
        if len(text.split()) < 4:
            os.remove(path)

Отрывок из списка языковых единиц, вошедших в тональный словарь для определения вербальной агрессии в тексте.

Упырь, баба, дичь, дурка, мразь, мракобесие, ничтожество, овца, подонок, лицемерный, клоунада, квакать, отморозок, америкос, зрад, перемога, молодчик, бред, забавный, жалкий, идиот, негр, кучка, дурак, тролль, орк, паразит и т.д.

Код реализации наивного байесовского классификатора на языке Python 3.

import math

import re

# Load the seed aggression lexicon: one entry per line of list.txt.
features = []
with open('list.txt', encoding='utf-8') as lexicon:  # fix: close the handle
    for raw in lexicon:
        # fix: rstrip('\n') instead of raw[:-1], which chopped the last
        # character of a final line that lacks a trailing newline
        features.append(raw.rstrip('\n'))

def select_features(corpus1, corpus2, ratio_thresh=2.0):
    """Return the words of *corpus1* whose relative frequency there is at
    least *ratio_thresh* times their relative frequency in *corpus2*.

    Words absent from *corpus2* are always selected (their ratio is
    effectively infinite); the original raised ZeroDivisionError on them.
    Progress is printed every 100 words, as before.
    """
    print('selecting features automatically')
    tokens1 = re.findall(r'\w+', corpus1.lower())
    tokens2 = re.findall(r'\w+', corpus2.lower())
    n1 = len(tokens1)
    n2 = len(tokens2)
    # fix: Counter counts each corpus once; the original called
    # tokens.count(w) per word, i.e. O(vocabulary * corpus) work
    from collections import Counter
    counts1 = Counter(tokens1)
    counts2 = Counter(tokens2)
    vocab = set(tokens1)
    selected = []
    total_words = len(vocab)
    counter = 1
    for w in vocab:
        if counter % 100 == 0:
            print('checking word {} out of {}'.format(counter, total_words))
            print('words selected so far:', len(selected))
        counter += 1
        c1 = counts1[w] / n1
        c2 = counts2[w] / n2
        # fix: c2 == 0 crashed the original with ZeroDivisionError
        if c2 == 0 or c1 / c2 >= ratio_thresh:
            selected.append(w)
    return selected

def extract_features(text, features):
    """Count occurrences of each feature word in *text* and return the
    counts as a list aligned with *features*."""
    counts = {feat: 0 for feat in features}
    for token in re.findall('\w+', text.lower()):
        if token in counts:
            counts[token] += 1
    return [counts[feat] for feat in features]

def add1_sm(counts):
    """Return a copy of *counts* with every frequency incremented by one
    (add-one smoothing of a word-frequency dictionary)."""
    return {word: freq + 1 for word, freq in counts.items()}

def train_nbc(features, corpora, smoothing=True):
    """Estimate per-class feature probabilities for a naive Bayes model.

    For each corpus in *corpora* (one per class) returns a vector, aligned
    with *features*, of P(feature | class), optionally with add-one
    (Laplace) smoothing.

    Fixes vs. the original:
      * with smoothing on, the denominator is N + |features| (proper
        Laplace), not N, so each class's probabilities sum to 1;
      * a corpus containing no feature words no longer divides by zero
        when smoothing is enabled.
    """
    print('training NBC')
    f_set = set(features)
    Params = []
    for corpus in corpora:
        tokens = re.findall('\w+', corpus.lower())
        # N = number of tokens that are feature words
        n_feat_tokens = len([t for t in tokens if t in f_set])
        d = {f: 0 for f in features}
        for t in tokens:
            if t in d:
                d[t] += 1
        if smoothing:
            d = {w: c + 1 for w, c in d.items()}  # add-one smoothing
            denom = n_feat_tokens + len(features)
        else:
            denom = n_feat_tokens
        Params.append([d[f] / denom for f in features])
    print('done training')
    return Params

def classify_log(text, features, classes, priors,
                 Params, legend=False):
    """Classify *text* with a trained naive Bayes model, in log space.

    Returns the element of *classes* whose score
    log(prior) + sum_j f_j * log(p_j) is highest, where f_j are the
    feature counts of *text* and p_j the class's feature probabilities.
    With *legend* set, prints per-feature values and the final decision.
    """
    # fix: feature extraction does not depend on the class - hoisted out
    # of the per-class loop (the original recomputed it for every class)
    feature_vec = extract_features(text, features)
    results = []
    for i, cla in enumerate(classes):
        res = math.log(priors[i])
        for j, f_val in enumerate(feature_vec):
            p_val = Params[i][j]
            if legend:
                print('feature:', features[j],
                      'f_val:', f_val,
                      'p_val:', p_val)
            # fix: math.log(p_val ** f_val) underflows to log(0.0) and
            # raises ValueError for large counts; f_val * log(p_val) is
            # the numerically stable equivalent (skipped when f_val == 0,
            # matching the original's log(p**0) == 0)
            if f_val:
                res += f_val * math.log(p_val)
        results.append(res)
    max_prod = max(results)
    c_i = results.index(max_prod)
    if legend:
        print('classes:', classes)
        print('results:', results)
        print('choosing', classes[c_i])
    return classes[c_i]

# Build the training corpora: concatenate the positive / negative training
# files into two plain-text blobs, then train the classifier.
# fix: the original wrote `f.close` without parentheses, so no read handle
# was ever closed; `with` closes everything deterministically.
for i in range(1, 651):
    with open('train/positive/' + str(i) + '.txt', 'r', encoding='utf8') as f:
        text = f.read()
    with open('positive_train.txt', 'a', encoding='utf8') as out:
        out.write(text)

for i in range(1, 471):
    with open('train/negative/' + str(i) + '.txt', 'r', encoding='utf8') as f:
        text = f.read()
    with open('negative_train.txt', 'a', encoding='utf8') as out:
        out.write(text)

with open('positive_train.txt', encoding='utf-8') as f:
    pos_train = f.read()
with open('negative_train.txt', encoding='utf-8') as f:
    neg_train = f.read()

corpora = pos_train, neg_train
# class 0 <-> first corpus (positive), class 1 <-> second (negative),
# matching the order of *corpora*
classes = 0, 1
Params = train_nbc(features, corpora)
priors = 0.5, 0.5  # uniform class priors

# Classify the held-out test files; predictions accumulate in h
# (positives first, then negatives - the same order as the gold vector y
# constructed below).
# fix: files are opened with `with`; the original leaked every handle.
h = []
for i in range(1, 164):
    with open('test/positive/' + str(i) + '.txt', 'r', encoding='utf8') as f:
        text = f.read()
    h.append(classify_log(text, features, classes, priors, Params, legend=False))

for i in range(1, 118):
    with open('test/negative/' + str(i) + '.txt', 'r', encoding='utf8') as f:
        text = f.read()
    h.append(classify_log(text, features, classes, priors, Params, legend=False))

def evaluate(h, y, klass='all'):
    """Compute (accuracy, precision, recall, f1) of predictions *h*
    against gold labels *y*, each rounded to 3 decimals.

    With klass='all' the per-class metrics are averaged, weighted by the
    class distribution in *y*; with a specific *klass* the metrics are
    computed for that class alone (weight 1).

    Fixes vs. the original:
      * class_dist is no longer recomputed after the klass branch, which
        silently overrode the single-class weight of 1;
      * a zero recall denominator (class never present in *y*) yields
        recall 0 instead of ZeroDivisionError, mirroring precision.
    """
    assert len(h) == len(y), 'The length of h ({}) and y ({}) should match'.format(len(h), len(y))
    len_y = len(y)
    if klass == 'all':
        classes = set(y)
        class_dist = {c: y.count(c) / len_y for c in classes}
    else:
        classes = (klass, )
        class_dist = {c: 1 for c in classes}
    results = {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}
    for c in classes:
        tp, fp, tn, fn = set(), set(), set(), set()
        for i, h_i in enumerate(h):
            y_i = y[i]
            if h_i == c:
                (tp if y_i == c else fp).add((i, h_i))
            else:
                (fn if y_i == c else tn).add((i, h_i))
        acc = len(tp | tn) / len(tp | tn | fp | fn)
        p_denom = len(tp | fp)
        p = len(tp) / p_denom if p_denom else 0
        r_denom = len(tp | fn)
        r = len(tp) / r_denom if r_denom else 0
        results['accuracy'] += acc * class_dist[c]
        results['precision'] += p * class_dist[c]
        results['recall'] += r * class_dist[c]
        f1_denom = p + r
        if f1_denom:
            results['f1'] += (2 * p * r / f1_denom) * class_dist[c]
    return (round(results['accuracy'], 3),
            round(results['precision'], 3),
            round(results['recall'], 3),
            round(results['f1'], 3))

# Gold labels: 163 positive test files (class 0) followed by 117 negative
# ones (class 1), matching the order predictions were appended to h.
y = [0] * 163 + [1] * 117
metrics = evaluate(h, y)
print('all classes:\nacc {} \nprec {} \nrec {} \nf1 {}'.format(*metrics), end='\n\n')