Sentiment analysis with R/test script
Here is a test script that is almost ready to use. You need to change the working directory or comment that line out. You can also change the category or the wiki. This code should also work with Wikipedia, using a URL such as:
http://en.wikipedia.org/w/api.php?action=parse&pageid=
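For example, the following minimal sketch shows the two URLs you would swap into the script below in order to read from the English Wikipedia instead of EduTechWiki. The category Sentiment_analysis is only a placeholder, and this adaptation is untested:

# Hypothetical adaptation for the English Wikipedia API (placeholder category name)
cat_pospap <- "http://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:Sentiment_analysis&cmlimit=500&cmtype=page&format=xml"
# Fetch each article's parsed content by its pageid
url_en_start <- "http://en.wikipedia.org/w/api.php?action=parse&pageid="
url_en_end <- "&format=xml"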
# Test with MediaWiki position papers (pospap)
# options(encoding = 'UTF-8')

# ---- set working directory
# Linux
# setwd ("~/schneide/methodo/R")
# Windows
setwd("s:/methodo/R")
setwd("c:/dks/methodo/R")
getwd()

library("sentiment")
library(tm)
library(XML)
library(tm.plugin.webmining)

# Get list of position papers from EduTechWiki
cat_pospap <- "http://edutechwiki.unige.ch/mediawiki/api.php?action=query&list=categorymembers&cmtitle=Category:Position_paper&cmlimit=500&cmtype=page&format=xml"
XML_list <- xmlTreeParse(cat_pospap, useInternalNodes = TRUE)
XML_list
XML2_list <- xpathSApply(XML_list, "//cm")
title_list = sapply(XML2_list, function(el) xmlGetAttr(el, "title"))
id_list = sapply(XML2_list, function(el) xmlGetAttr(el, "pageid"))
title_list[[1]]
id_list[[1]]

# --- Identify the URLs for each page (article)
# Start and end of the URL. Note the "pageid", which will return an article by its "pageid"
url_en_start <- "http://edutechwiki.unige.ch/mediawiki/api.php?action=parse&pageid="
url_en_end <- "&format=xml"
article_ids_list <- character(length(id_list))
for (i in 1:length(id_list)) {
  article_ids_list[i] <- paste(url_en_start, id_list[i], url_en_end, sep="")
}
# This is the list of articles
article_ids_list

# Define a reader function that will only read the "text" element
readMWXML <- readXML(spec = list(content = list("node", "//text"),
                                 heading = list("attribute", "//parse/@title")),
                     doc = PlainTextDocument())

# ----- download the page contents
pospap.source <- VCorpus(URISource(article_ids_list, encoding="UTF-8"),
                         readerControl = list(reader = readMWXML, language = "en"))
names(pospap.source)

# Change the "id"s (titles instead of unreadable URLs)
for (j in seq.int(pospap.source)) {
  meta(pospap.source[[j]], "id") <- title_list[j]
}
names(pospap.source)

# Wrap everything in an html tag - good voodoo
pospap.source <- tm_map(pospap.source, encloseHTML)

# Write the HTML fragments to files (not needed, but allows inspection)
writeCorpus(pospap.source, path="./wiki_pospap_source")

# ------------------------------- Clean text into bags of words
pospap.cl1 <- tm_map(pospap.source, content_transformer(tolower))
pospap.cl2 <- tm_map(pospap.cl1, content_transformer(extractHTMLStrip))
pospap.cl2 <- tm_map(pospap.cl2, removePunctuation, preserve_intra_word_dashes = TRUE)

# curly quotes = \u2019
(kill_chars <- content_transformer(function(x, pattern) gsub(pattern, " ", x)))
pospap.cl2 <- tm_map(pospap.cl2, kill_chars, "\u2019")
pospap.cl2 <- tm_map(pospap.cl2, kill_chars, "'")
pospap.cl2 <- tm_map(pospap.cl2, kill_chars, "\\[modifier\\]")
pospap.cl2 <- tm_map(pospap.cl2, kill_chars, "[«»”“\"]")

pospap.essence <- tm_map(pospap.cl2, removeWords, stopwords("english"))
# pospap.roots <- tm_map(pospap.essence, stemDocument, language="english")
pospap.roots <- tm_map(pospap.essence, stripWhitespace)

writeCorpus(pospap.roots, path="./wiki_pospap")

# test
pospap.roots[[1]]
class(pospap.roots)
sentiment(as.character(pospap.roots[[1]]))

# Analyse each text in the corpus
for (docN in seq.int(pospap.roots)) {
  print(paste0("[[:en:", pospap.roots[[docN]]$meta$heading, "|",
               pospap.roots[[docN]]$meta$heading, "]]",
               " = ", sentiment(pospap.roots[[docN]]$content)))
}
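If you would rather keep the per-document scores than just print them, the following minimal sketch reuses the same calls as the final loop above and stores them in a named list (results is a new, hypothetical variable name):

# Collect the sentiment result of each document in a list, keyed by page title
results <- list()
for (docN in seq.int(pospap.roots)) {
  results[[pospap.roots[[docN]]$meta$heading]] <- sentiment(pospap.roots[[docN]]$content)
}
# Inspect the result for the first position paper
results[[1]]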