Module:Citation/CS1/COinS

The educational technology and digital learning wiki
Jump to navigation Jump to search

Documentation for this module may be created at Module:Citation/CS1/COinS/doc

  1 local coins = {};
  2 
  3 
  4 --[[--------------------------< F O R W A R D   D E C L A R A T I O N S >--------------------------------------
  5 ]]
  6 
  7 local is_set, in_array, remove_wiki_link;										-- functions in Module:Citation/CS1/Utilities
  8 
  9 local cfg;																		-- table of configuration tables that are defined in Module:Citation/CS1/Configuration
 10 
 11 
 12 --[[--------------------------< S T R I P _ A P O S T R O P H E _ M A R K U P >--------------------------------
 13 
 14 Strip wiki italic and bold markup from argument so that it doesn't contaminate COinS metadata.
 15 This function strips common patterns of apostrophe markup.  We presume that editors who have taken the time to
 16 markup a title have, as a result, provided valid markup. When they don't, some single apostrophes are left behind.
 17 
 18 ]]
 19 
 20 local function strip_apostrophe_markup (argument)
 21 	if not is_set (argument) then return argument; end
 22 
 23 	if argument:find ( "''", 1, true ) == nil then								-- Is there at least one double apostrophe?  If not, exit.
 24 		return argument;
 25 	end
 26 
 27 	while true do
 28 		if argument:find ( "'''''", 1, true ) then								-- bold italic (5)
 29 			argument=argument:gsub("%'%'%'%'%'", "");							-- remove all instances of it
 30 		elseif argument:find ( "''''", 1, true ) then							-- italic start and end without content (4)
 31 			argument=argument:gsub("%'%'%'%'", "");
 32 		elseif argument:find ( "'''", 1, true ) then							-- bold (3)
 33 			argument=argument:gsub("%'%'%'", "");
 34 		elseif argument:find ( "''", 1, true ) then								-- italic (2)
 35 			argument=argument:gsub("%'%'", "");
 36 		else
 37 			break;
 38 		end
 39 	end
 40 	return argument;															-- done
 41 end
 42 
 43 
 44 --[[--------------------------< M A K E _ C O I N S _ T I T L E >----------------------------------------------
 45 
 46 Makes a title for COinS from Title and / or ScriptTitle (or any other name-script pairs)
 47 
 48 Apostrophe markup (bold, italics) is stripped from each value so that the COinS metadata isn't corrupted with strings
 49 of %27%27...
 50 
 51 ]]
 52 
 53 local function make_coins_title (title, script)
 54 	if is_set (title) then
 55 		title = strip_apostrophe_markup (title);								-- strip any apostrophe markup
 56 	else
 57 		title='';																-- if not set, make sure title is an empty string
 58 	end
 59 	if is_set (script) then
 60 		script = script:gsub ('^%l%l%s*:%s*', '');								-- remove language prefix if present (script value may now be empty string)
 61 		script = strip_apostrophe_markup (script);								-- strip any apostrophe markup
 62 	else
 63 		script='';																-- if not set, make sure script is an empty string
 64 	end
 65 	if is_set (title) and is_set (script) then
 66 		script = ' ' .. script;													-- add a space before we concatenate
 67 	end
 68 	return title .. script;														-- return the concatenation
 69 end
 70 
 71 
 72 --[[--------------------------< E S C A P E _ L U A _ M A G I C _ C H A R S >----------------------------------
 73 
 74 Returns a string where all of lua's magic characters have been escaped.  This is important because functions like
 75 string.gsub() treat their pattern and replace strings as patterns, not literal strings.
 76 ]]
 77 
 78 local function escape_lua_magic_chars (argument)
 79 	argument = argument:gsub("%%", "%%%%");										-- replace % with %%
 80 	argument = argument:gsub("([%^%$%(%)%.%[%]%*%+%-%?])", "%%%1");				-- replace all other lua magic pattern characters
 81 	return argument;
 82 end
 83 
 84 
 85 --[[--------------------------< G E T _ C O I N S _ P A G E S >------------------------------------------------
 86 
 87 Extract page numbers from external wikilinks in any of the |page=, |pages=, or |at= parameters for use in COinS.
 88 
 89 ]]
 90 
 91 local function get_coins_pages (pages)
 92 	local pattern;
 93 	if not is_set (pages) then return pages; end								-- if no page numbers then we're done
 94 	
 95 	while true do
 96 		pattern = pages:match("%[(%w*:?//[^ ]+%s+)[%w%d].*%]");					-- pattern is the opening bracket, the url and following space(s): "[url "
 97 		if nil == pattern then break; end										-- no more urls
 98 		pattern = escape_lua_magic_chars (pattern);								-- pattern is not a literal string; escape lua's magic pattern characters
 99 		pages = pages:gsub(pattern, "");										-- remove as many instances of pattern as possible
100 	end
101 	pages = pages:gsub("[%[%]]", "");											-- remove the brackets
102 	pages = pages:gsub("–", "-" );							-- replace endashes with hyphens
103 	pages = pages:gsub("&%w+;", "-" );						-- and replace html entities (&ndash; etc.) with hyphens; do we need to replace numerical entities like &#32; and the like?
104 	return pages;
105 end
106 
107 
--[=[-------------------------< C O I N S _ R E P L A C E _ M A T H _ S T R I P M A R K E R >------------------

There are three options for math markup rendering that depend on the editor's math preference settings.  These
settings are at [[Special:Preferences#mw-prefsection-rendering]] and are
	PNG images
	TeX source
	MathML with SVG or PNG fallback

All three are heavy with html and css which doesn't belong in the metadata.

Without this function, the metadata saved in the raw wikitext contained the rendering determined by the settings
of the last editor to save the page.

This function gets the rendered form of an equation according to the editor's preference before the page is saved.  It
then searches the rendering for the text equivalent of the rendered equation and replaces the rendering with that so
that the page is saved without extraneous html/css markup and with a reasonably readable text form of the equation.

Only the first math stripmarker in value is handled per call.  To replace multiple equations it is necessary to
call this function from within a loop.

Returns exactly two values: true and the value with the replacement made, or false and the initial value when
value has no math stripmarker or its rendering is not one of the three recognized forms.

]=]

local function coins_replace_math_stripmarker (value)
	local stripmarker = '\127UNIQ%-%-math%-[%a%d]+%-QINU\127';					-- math stripmarker pattern
	local rendering = value:match (stripmarker);								-- is there a math stripmarker

	if not rendering then														-- when value doesn't have a math stripmarker, abandon this test
		return false, value;
	end

	rendering = mw.text.unstripNoWiki (rendering);								-- convert stripmarker into rendered value

	if 'string' ~= type (rendering) then										-- nothing usable came back; leave value alone
		return false, value;
	end

	local math_text = rendering:match ('alt="([^"]+)"')							-- PNG math option
		or rendering:match ('$%s+(.+)%s+%$')									-- TeX math option; $ is legit character that is escaped as \$
		or rendering:match ('<annotation[^>]+>(.+)</annotation>');				-- MathML math option

	if not math_text then														-- had math stripmarker but not one of the three defined forms
		return false, value;
	end

	-- A function is used as the replacement so that math_text is inserted verbatim; in a replacement string any
	-- '%' in the math text (TeX \% for example) would be treated as an escape instead of literal text.
	local replaced = value:gsub (stripmarker, function () return math_text; end, 1);

	return true, replaced;
end
152 
153 
--[[--------------------------< C O I N S _ C L E A N U P >----------------------------------------------------

Cleanup parameter values for the metadata by removing or replacing invisible characters and certain html entities.

Steps, in order:
	math stripmarkers are replaced with a text form of the equation; any that cannot be replaced become
		"MATH RENDER ERROR"
	nowiki stripmarkers are replaced with their content
	the renderings of the {{'s}} and {{'}} templates become plain apostrophe-s and apostrophe
	&nbsp; and hair space become a plain space
	&zwj;, zero-width joiner and zero-width space are removed
	soft hyphen, horizontal tab, line feed and carriage return become a plain space

Multi-byte characters are matched as whole byte sequences.  string.gsub() works on bytes, so putting the bytes of a
multi-byte character inside a [...] set would match each byte on its own and damage other characters that share
those bytes.

2015-12-10: there is a bug in mw.text.unstripNoWiki ().  It replaced math stripmarkers with the appropriate content
when it shouldn't.  See https://phabricator.wikimedia.org/T121085 and Wikipedia_talk:Lua#stripmarkers_and_mw.text.unstripNoWiki.28.29

TODO: move the replacement patterns and replacement values into a table in /Configuration similar to the invisible
characters table?

value: string to clean
returns: the cleaned string

]]

local function coins_cleanup (value)
	local replaced = true;														-- default state to get the loop running

	while replaced do															-- loop until all math stripmarkers replaced
		replaced, value = coins_replace_math_stripmarker (value);				-- replace math stripmarker with text representation of the equation
	end

	value = value:gsub ('\127UNIQ%-%-math%-[%a%d]+%-QINU\127', "MATH RENDER ERROR");	-- one or more couldn't be replaced; insert vague error message

	value = mw.text.unstripNoWiki (value);										-- replace nowiki stripmarkers with their content
	value = value:gsub ('<span class="nowrap" style="padding%-left:0%.1em;">&#39;s</span>', "'s");	-- replace {{'s}} template with simple apostrophe-s
	value = value:gsub ('&zwj;\226\128\138\039\226\128\139', "'");				-- replace {{'}} with simple apostrophe
	value = value:gsub ('\226\128\138\039\226\128\139', "'");					-- replace {{'}} with simple apostrophe (as of 2015-12-11)
	value = value:gsub ('&nbsp;', ' ');											-- replace &nbsp; entity with plain space
	value = value:gsub ('\226\128\138', ' ');									-- replace hair space (U+200A) with plain space
	value = value:gsub ('&zwj;', '');											-- remove &zwj; entities
	value = value:gsub ('\226\128\141', '');									-- remove zero-width joiner (U+200D)
	value = value:gsub ('\226\128\139', '');									-- remove zero-width space (U+200B)
	value = value:gsub ('\194\173', ' ');										-- replace soft hyphen (U+00AD) with plain space
	value = value:gsub ('[\009\010\013]', ' ');									-- replace horizontal tab, line feed, carriage return with plain space
	return value;
end
186 
187 
--[[--------------------------< C O I N S >--------------------------------------------------------------------

COinS metadata (see <http://ocoins.info/>) allows automated tools to parse the citation information.

data: table of citation values.  Every value except data.ID_list and data.Authors is run through coins_cleanup()
	and written back into data.  data.ID_list (identifier name => value) and data.Authors (sequence of tables
	with last and first fields) are optional; when absent they are treated as empty.
class: name of the citation class; selects journal, book, or dissertation metadata and the genre.

Returns the empty string when data is not a table or is an empty table.  Otherwise returns the key=value pairs,
values url-encoded, sorted and joined with '&', with ctx_ver always first.

]]

local function COinS(data, class)
	if 'table' ~= type(data) or nil == next(data) then
		return '';
	end

	for k, v in pairs (data) do													-- spin through all of the metadata parameter values
		if 'ID_list' ~= k and 'Authors' ~= k then								-- except the ID_list and Author tables (author nowiki stripmarker done when Author table processed)
			data[k] = coins_cleanup (v);										-- assigning to an existing key during traversal is allowed
		end
	end

	local ctx_ver = "Z39.88-2004";

	-- treat table strictly as an array with only set values.
	-- named keys are never stored, so every assignment lands here and is appended as 'key=encoded value'
	local OCinSoutput = setmetatable( {}, {
		__newindex = function(self, key, value)
			if is_set(value) then
				rawset( self, #self+1, table.concat{ key, '=', mw.uri.encode( remove_wiki_link( value ) ) } );
			end
		end
	});

	if in_array (class, {'arxiv', 'journal', 'news'}) or (in_array (class, {'conference', 'interview', 'map', 'press release', 'web'}) and is_set(data.Periodical)) or
		('citation' == class and is_set(data.Periodical) and not is_set (data.Encyclopedia)) then
			OCinSoutput.rft_val_fmt = "info:ofi/fmt:kev:mtx:journal";			-- journal metadata identifier
			if 'arxiv' == class then											-- set genre according to the type of citation template we are rendering
				OCinSoutput["rft.genre"] = "preprint";							-- cite arxiv
			elseif 'conference' == class then
				OCinSoutput["rft.genre"] = "conference";						-- cite conference (when Periodical set)
			elseif 'web' == class then
				OCinSoutput["rft.genre"] = "unknown";							-- cite web (when Periodical set)
			else
				OCinSoutput["rft.genre"] = "article";							-- journal and other 'periodical' articles
			end
			OCinSoutput["rft.jtitle"] = data.Periodical;						-- journal only
			OCinSoutput["rft.atitle"] = data.Title;								-- 'periodical' article titles

																				-- these used only for periodicals
			OCinSoutput["rft.ssn"] = data.Season;								-- keywords: winter, spring, summer, fall
			OCinSoutput["rft.chron"] = data.Chron;								-- free-form date components
			OCinSoutput["rft.volume"] = data.Volume;							-- does not apply to books
			OCinSoutput["rft.issue"] = data.Issue;
			OCinSoutput["rft.pages"] = data.Pages;								-- also used in book metadata

	elseif 'thesis' ~= class then												-- all others except cite thesis are treated as 'book' metadata; genre distinguishes
		OCinSoutput.rft_val_fmt = "info:ofi/fmt:kev:mtx:book";					-- book metadata identifier
		if 'report' == class or 'techreport' == class then						-- cite report and cite techreport
			OCinSoutput["rft.genre"] = "report";
		elseif 'conference' == class then										-- cite conference when Periodical not set
			OCinSoutput["rft.genre"] = "conference";
			OCinSoutput["rft.atitle"] = data.Chapter;							-- conference paper as chapter in proceedings (book)
		elseif in_array (class, {'book', 'citation', 'encyclopaedia', 'interview', 'map'}) then
			if is_set (data.Chapter) then
				OCinSoutput["rft.genre"] = "bookitem";
				OCinSoutput["rft.atitle"] = data.Chapter;						-- book chapter, encyclopedia article, interview in a book, or map title
			else
				if 'map' == class or 'interview' == class then
					OCinSoutput["rft.genre"] = 'unknown';						-- standalone map or interview
				else
					OCinSoutput["rft.genre"] = 'book';							-- book and encyclopedia
				end
			end
		else	--{'audio-visual', 'AV-media-notes', 'DVD-notes', 'episode', 'interview', 'mailinglist', 'map', 'newsgroup', 'podcast', 'press release', 'serial', 'sign', 'speech', 'web'}
			OCinSoutput["rft.genre"] = "unknown";
		end
		OCinSoutput["rft.btitle"] = data.Title;									-- book only
		OCinSoutput["rft.place"] = data.PublicationPlace;						-- book only
		OCinSoutput["rft.series"] = data.Series;								-- book only
		OCinSoutput["rft.pages"] = data.Pages;									-- book, journal
		OCinSoutput["rft.edition"] = data.Edition;								-- book only
		OCinSoutput["rft.pub"] = data.PublisherName;							-- book and dissertation

	else																		-- cite thesis
		OCinSoutput.rft_val_fmt = "info:ofi/fmt:kev:mtx:dissertation";			-- dissertation metadata identifier
		OCinSoutput["rft.title"] = data.Title;									-- dissertation (also patent but that is not yet supported)
		OCinSoutput["rft.degree"] = data.Degree;								-- dissertation only
		OCinSoutput['rft.inst'] = data.PublisherName;							-- book and dissertation
	end
																				-- and now common parameters (as much as possible)
	OCinSoutput["rft.date"] = data.Date;										-- book, journal, dissertation

	for k, v in pairs( data.ID_list or {} ) do									-- what to do about these? For now assume that they are common to all?
		local handler = cfg.id_handlers[k];										-- nil when the identifier has no configured handler
		local id = handler and handler.COinS;									-- identifiers without a handler or COinS setting are skipped
		local id_value = v;														-- work on a copy; the loop variable is left untouched

		if k == 'ISBN' then
			id_value = id_value:gsub( "[^-0-9X]", "" );							-- keep only hyphens, digits, and X
		end

		if string.sub( id or "", 1, 4 ) == 'info' then							-- for ids that are in the info:registry
			OCinSoutput["rft_id"] = table.concat{ id, "/", id_value };
		elseif string.sub (id or "", 1, 3 ) == 'rft' then						-- for isbn, issn, eissn, etc that have defined COinS keywords
			OCinSoutput[ id ] = id_value;
		elseif id then															-- when cfg.id_handlers[k].COinS is not nil
			OCinSoutput["rft_id"] = table.concat{ handler.prefix, id_value };	-- others; provide a url
		end
	end

	local last, first;
	for k, v in ipairs( data.Authors or {} ) do
		last, first = coins_cleanup (v.last or ''), coins_cleanup (v.first or '');	-- replace any nowiki strip markers, non-printing or invisible characters
		if k == 1 then															-- for the first author name only
			if is_set(last) and is_set(first) then								-- set these COinS values if |first= and |last= specify the first author name
				OCinSoutput["rft.aulast"] = last;								-- book, journal, dissertation
				OCinSoutput["rft.aufirst"] = first;								-- book, journal, dissertation
			elseif is_set(last) then
				OCinSoutput["rft.au"] = last;									-- book, journal, dissertation -- otherwise use this form for the first name
			end
		else																	-- for all other authors
			if is_set(last) and is_set(first) then
				OCinSoutput["rft.au"] = table.concat{ last, ", ", first };		-- book, journal, dissertation
			elseif is_set(last) then
				OCinSoutput["rft.au"] = last;									-- book, journal, dissertation
			end
		end
	end

	OCinSoutput.rft_id = data.URL;
	OCinSoutput.rfr_id = table.concat{ "info:sid/", mw.site.server:match( "[^/]*$" ), ":", data.RawPage };
	OCinSoutput = setmetatable( OCinSoutput, nil );								-- done collecting; make it a plain table again

	-- sort with version string always first, and combine.
	table.sort( OCinSoutput );
	table.insert( OCinSoutput, 1, "ctx_ver=" .. ctx_ver );  -- such as "Z39.88-2004"
	return table.concat(OCinSoutput, "&");
end
327 
328 
--[[--------------------------< S E T _ S E L E C T E D _ M O D U L E S >--------------------------------------

Points this module at the same (live or sandbox) configuration table and utility functions as the other modules.

cfg_table_ptr: the configuration table; stored in cfg
utilities_page_ptr: table that supplies is_set, in_array, and remove_wiki_link

]]

local function set_selected_modules (cfg_table_ptr, utilities_page_ptr)
	cfg = cfg_table_ptr;

	local utilities = utilities_page_ptr;										-- functions come from the selected Module:Citation/CS1/Utilities module
	is_set = utilities.is_set;
	in_array = utilities.in_array;
	remove_wiki_link = utilities.remove_wiki_link;
end
342 
343 
344 
return {																		-- functions made available to the other modules
	COinS = COinS,
	get_coins_pages = get_coins_pages,
	make_coins_title = make_coins_title,
	set_selected_modules = set_selected_modules,
	}