Analyse de sentiments avec R/test script

De EduTech Wiki
Aller à : navigation, rechercher

Voici un script de test presque prêt à l'emploi. Il faut adapter le working directory (ou commenter les lignes setwd). Vous pouvez aussi changer de catégorie ou de wiki. Ce code devrait aussi marcher avec Wikipedia en utilisant une URL comme :

http://en.wikipedia.org/w/api.php?action=parse&pageid=
# Test script: sentiment analysis of MediaWiki position papers
# Uncomment to force UTF-8 I/O if your locale mangles accented characters:
# options(encoding = 'UTF-8')
# ---- set working directory
# Linux
# setwd ("~/schneide/methodo/R")
# Windows
# NOTE(review): if both setwd() calls succeed, the second one wins;
# comment out whichever does not apply to your machine.
setwd("s:/methodo/R")
setwd("c:/dks/methodo/R")
# Echo the working directory so the user can confirm the path took effect
getwd()

library("sentiment")
library(tm)
library(XML)
library(tm.plugin.webmining)

# ---- Get the list of position papers from EduTechWiki ----
# MediaWiki API query: all pages (up to 500) in Category:Position_paper,
# returned as XML. Each member is a <cm> node with "title" and "pageid"
# attributes.
cat_pospap <- "http://edutechwiki.unige.ch/mediawiki/api.php?action=query&list=categorymembers&cmtitle=Category:Position_paper&cmlimit=500&cmtype=page&format=xml"
XML_list <- xmlTreeParse(cat_pospap, useInternalNodes = TRUE)
XML_list
# Extract the <cm> nodes from the API answer
XML2_list <- xpathSApply(XML_list, "//cm")
# vapply() instead of sapply(): guarantees a character vector even when
# the category is empty, where sapply() would silently return list().
title_list <- vapply(XML2_list, function(el) xmlGetAttr(el, "title"), character(1))
id_list <- vapply(XML2_list, function(el) xmlGetAttr(el, "pageid"), character(1))
# Quick inspection of the first entry
title_list[[1]]
id_list[[1]]

# ---- Identify the URL for each page (article) ----
# Prefix and suffix of the parse-API URL; the "pageid" inserted between
# them selects which article the API returns.
url_en_start <- "http://edutechwiki.unige.ch/mediawiki/api.php?action=parse&pageid="
url_en_end <- "&format=xml"

# paste0() is vectorized over id_list, so no explicit loop is needed.
# This also avoids the 1:length(id_list) footgun, which would iterate
# over c(1, 0) if id_list were empty.
article_ids_list <- paste0(url_en_start, id_list, url_en_end)

# This is the list of article URLs
article_ids_list

# Define a reader function that will only read the "text" element
# of the MediaWiki parse-API XML answer; the page title (the "title"
# attribute of the <parse> node) becomes the document heading.
# readXML() is from tm; the result is a reader usable in readerControl.
readMWXML <- 
  readXML (spec = list (content = list ("node", "//text"),
                        heading = list ("attribute", "//parse/@title")
  ),
  doc=PlainTextDocument())

# ----- Download the page contents -----
# One URI per article; readMWXML extracts the wiki text and the title.
pospap.source <- VCorpus(URISource(article_ids_list, encoding="UTF-8"),
                       readerControl=list(reader=readMWXML, language="en"))
names(pospap.source)

# Replace the document "id"s (page titles instead of unreadable URLs).
# seq_along() replaces seq.int(), which is meant for numeric arguments,
# not list-like objects such as a VCorpus; it also handles an empty
# corpus gracefully.
for (j in seq_along(pospap.source)) {
  meta(pospap.source[[j]], "id") <- title_list[j]
}

names(pospap.source)

# Wrap an <html> tag around each document - good voodoo that helps the
# later HTML parsing step (encloseHTML is from tm.plugin.webmining)
pospap.source <- tm_map (pospap.source, encloseHTML)
# Write the HTML fragments to files (not required, but allows inspection)
writeCorpus(pospap.source, path="./wiki_pospap_source")

# ------------------------------- Clean text into bags of words
# NOTE: the order of the transformations matters - lowercase first,
# then strip the HTML markup, then remove punctuation and noise strings.

pospap.cl1 <- tm_map(pospap.source, content_transformer(tolower))
pospap.cl2 <- tm_map(pospap.cl1, content_transformer(extractHTMLStrip))
pospap.cl2 <- tm_map (pospap.cl2, removePunctuation, preserve_intra_word_dashes = TRUE)
# Helper transformer: replace every match of `pattern` by a space.
# (Outer parentheses make the assignment also print the result.)
# curly quotes = \u2019
(kill_chars <- content_transformer (function(x, pattern) gsub(pattern, " ", x)))
# Kill curly and straight apostrophes, the "[modifier]" edit links left
# over from the wiki markup, and the various quotation marks.
pospap.cl2 <- tm_map (pospap.cl2, kill_chars, "\u2019")
pospap.cl2 <- tm_map (pospap.cl2, kill_chars,"'")
pospap.cl2 <- tm_map (pospap.cl2, kill_chars,"\\[modifier\\]")
pospap.cl2 <- tm_map (pospap.cl2, kill_chars,"[«»”“\"]")

# Drop English stop words; stemming is left commented out on purpose
pospap.essence <- tm_map (pospap.cl2, removeWords, stopwords("english"))
# pospap.roots <- tm_map (pospap.essence, stemDocument, language="english")
pospap.roots <- tm_map (pospap.essence, stripWhitespace)
# Write the cleaned documents to files for inspection
writeCorpus(pospap.roots, path="./wiki_pospap")

# ---- Quick sanity check on the first cleaned document ----
pospap.roots[[1]]
class(pospap.roots)
# sentiment() (package "sentiment") classifies the polarity of the text
sentiment(as.character(pospap.roots[[1]]))

# ---- Analyse each text in the corpus ----
# Prints one wiki-syntax line per document:
#   [[:en:<title>|<title>]] = <sentiment result>
# seq_along() replaces seq.int(), which is meant for numeric arguments,
# not list-like objects; the repeated $meta$heading lookup is hoisted
# into a local variable.
for (docN in seq_along(pospap.roots)) {
  doc_title <- pospap.roots[[docN]]$meta$heading
  print(paste0(
    "[[:en:", doc_title, "|", doc_title, "]]",
    " = ",
    sentiment(pospap.roots[[docN]]$content)))
}