-- Module:Categorizer

-- Documentation for this module may be created at Module:Categorizer/doc

-- version 202103111200 from master @kabwiki

-- Wikidata class ids that select the "grapheme" category (see `categories`).
local grapheme_ids = { 
	'Q2545446', -- grapheme

	'Q19776628', -- Latin-script letter
	'Q9788', -- letter
	'Q9398093', -- vowel letter
	'Q3841820', -- consonant letter
	
	'Q19793459', -- Greek letter
	'Q19793988', -- Cyrillic letter
	'Q41713761', -- Arabic letter
	'Q19793980', -- Semitic letter
	'Q41799439', -- letter of the Ge'ez script
	
	'Q21092339', -- mathematical symbol
	
	'Q41885931', -- IPA symbol
	'Q41798684', -- kana character
	'Q2373910', -- syllabogram
	'Q53764732', -- CJK character
	'Q10617810', -- punctuation mark
	'Q1668151', -- punctuation mark (NOTE(review): same label as the previous id; confirm both labels on Wikidata)
	'Q3241972', -- character
	
	'Q188725', -- ligature
	'Q36975', -- glyph
}

-- Wikidata class ids that select the "number" category (see `categories`).
local number_ids = { 
	'Q21199', -- natural number
	'Q12503', -- integer
}

-- Wikidata class ids that select the "year" category (see `categories`).
local year_ids = { 
	'Q577', -- year
	'Q3186692', -- calendar year
	'Q3311614', -- century leap year
	'Q19828', -- leap year
	'Q2378962', -- century year
}


-- Wikidata class ids that select the "human" category (see `categories`).
-- The list also holds fictional characters and deities, so "human" is used loosely here.
local human_ids = {	
	'Q5', -- human
	'Q15632617', -- fictional human
	'Q15773317', -- television character
	'Q95074', -- fictional character
	'Q21070598', -- narrative entity
	'Q21070568', -- human who may be fictional
	'Q178885', -- deity
	'Q13405593', -- nature deity
	'Q235113', -- angel
}
-- Wikidata class ids that select the "language" category (see `categories`).
local language_ids = {
	'Q34770', -- language
	'Q33742', -- natural language
	'Q38058796', -- extinct language
	'Q2315359', -- historical language
	'Q45762', -- dead language
	'Q33384', -- dialect
	'Q17376908', -- languoid
	'Q4536530', -- language (NOTE(review): same label as Q34770 above; confirm on Wikidata)
	'Q152559', -- macrolanguage
}
-- Wikidata class ids that select the "language family" category (see `categories`).
local lanfamily_ids = { 
	'Q25295', -- language family
	'Q11820611', -- language subfamily
}
-- Wikidata class ids that select the "ethnic group" category (see `categories`).
local ethnic_ids = { 
	'Q41710', -- ethnic group
}
-- Wikidata class ids that select the "taxon" category (see `categories`).
local taxon_ids = { 
	'Q16521', -- taxon
	'Q713623', -- clade
	'Q23038290', -- fossil taxon
	'Q310890', -- monotypic taxon
	'Q855769', -- strain
	'Q75913269', -- group or class of strains
}
-- Wikidata class ids that select the "country" category (see `categories`).
-- NOTE(review): the trailing "*" on some labels is not explained in this file;
-- it presumably marks sub-national divisions that are treated as countries here.
local country_ids = { 
	'Q6256', -- country
	'Q3624078', -- sovereign state
	'Q3024240', -- historical country
	'Q417175', -- kingdom
	'Q1048835', -- political territorial entity
	'Q15634554', -- state with limited recognition
	'Q107390', -- federated state
	'Q35657', -- state of the United States*
	'Q34876', -- province*
	'Q10864048', -- first-level administrative country subdivision*
	'Q6465', -- department of France*
	'Q36784', -- region of France*
	'Q859869', -- region of Niger*
	'Q162620', -- province of Spain*
	'Q467745', -- union territory of India
	
	'Q15239622', -- disputed territory
	'Q1970725', -- natural region
	'Q1620908', -- historical region
	'Q3502482', -- cultural region
	
	'Q2577883', -- occupied territory
}

-- Wikidata class ids that select the "city" category (see `categories`).
-- Covers settlements of any size plus municipality-level administrative units.
-- "city" is tested before "country" in `queryindex`, so it wins when both match.
local city_ids = { 
	'Q5119', -- capital
	'Q515', -- city
	'Q2264924', -- port city
	'Q1549591', -- big city
	'Q15253706', -- like a city
	'Q7930989', -- city/town
	'Q174844', -- megacity
	'Q200250', -- metropolis
	'Q149621', -- district
	
	'Q1637706', -- city with millions of inhabitants
	'Q123705', -- neighborhood
	
	'Q532', -- village
	'Q5084', -- hamlet
	'Q21507948', -- former village
	'Q350895', -- abandoned village
	'Q486972', -- human settlement
	'Q22674925', -- former populated place
	'Q10354598', -- rural settlement
	
	'Q1093829', -- city of the United States
	'Q3327870', -- municipal corporation in the United States
	'Q21518270', -- state or insular area capital in the United States
	'Q54935504', -- city of Switzerland
	'Q137773', -- ward of Japan
	'Q2989398', -- commune of Algeria
	'Q484170', -- commune of France
	'Q23759397', -- municipality of Bangladesh
	'Q328584', -- municipality of Slovenia
	'Q203300', -- municipality of Liechtenstein

	'Q2590631', -- municipality of Hungary
	'Q2074737', -- municipality of Spain
	'Q605291', -- municipality of Niger
	'Q41067667', -- municipality of Tunisia
	'Q3327862', -- urban commune of Morocco
	'Q2989470', -- commune of Mauritania
	'Q17318027', -- rural commune of Morocco
	
	'Q15284', -- municipality
	'Q3266850', -- commune
	'Q515483', -- baladiyah
	
	'Q30059', -- arrondissement
	
	'Q1639634', -- local government area of Nigeria*
	'Q2914501', -- department of Niger*
	
	'Q240601', -- province of Algeria
	'Q2614970', -- district of Algeria
}

-- Wikidata properties whose mere presence on an item selects the "city" category.
-- Only consulted as a fallback when no class id matched (see `p_categories`).
local city_properties = { 
	'P150', -- contains administrative territorial entity
	'P1376', -- capital of
	'P190', -- twinned administrative body
}

-- Wikidata class ids that select the "musical band" category (see `categories`).
local musical_band_ids = { 
	'Q215380', -- band
	'Q56816954', -- heavy metal band
	'Q2088357' -- musical ensemble
}
-- Wikidata class ids that select the "album" category (see `categories`).
local album_ids = { 
	'Q482994', -- album
	'Q208569' -- studio album
}
-- Wikidata class ids that select the "tv" category (see `categories`).
-- Despite the name, the list covers films as well as television series.
local tv_ids = { 
	'Q5398426', -- television series
	'Q63952888', -- anime television series
	'Q581714', -- animated series
	'Q11424', -- film
	'Q17517379', -- animated short film
	'Q202866', -- animated film
	'Q24862', -- short film
}

-- Wikidata class ids that select the "book" category (see `categories`).
local book_ids = { 
	'Q47461344', -- written work
	'Q179461', -- religious text
	'Q7725634', -- literary work
	'Q14406742', -- comic book series
	'Q21198342', -- manga series
	'Q10901350', -- anime and manga *
}

-- Wikidata class ids that select the "intellectual work" category (see `categories`).
-- Tested after "album", "tv" and "book" in `queryindex`, so it acts as a broader catch-all for works.
local int_work_ids = { 
	'Q15621286', -- intellectual work
	'Q196600', -- media franchise
	'Q1344', -- opera
	'Q58483083', -- dramatico-musical work
	'Q2188189', -- musical work
	'Q838948', -- work of art
	'Q7889', -- video game
}

-- Wikidata class ids that select the "religion" category (see `categories`).
local religion_ids = { 
	'Q9174', -- religion
	'Q6957341', -- major religious group
	'Q1189816', -- ethnic religion
	'Q13414953', -- religious denomination
	'Q1192063', -- Islamic schools and branches
	'Q19097', -- sect
	'Q879146', -- Christian denomination
	'Q1530022', -- religious organization
	'Q3001185', -- Jewish denomination
	'Q5839321', -- religious school of thought
	'Q222516', -- school of Buddhism
	'Q1826286', -- religious movement
	'Q209928', -- madhhab
}

-- Wikidata class ids that select the "script" (writing system) category (see `categories`).
local script_ids = { 
	'Q8192', -- writing system
	'Q29517555', -- natural script
	'Q1191702', -- constructed script
	'Q4004706', -- unicase alphabet
	'Q65045986', -- bicameral alphabet
	'Q9779', -- alphabet
	'Q185087', -- abjad
	'Q1049394', -- phonetic writing system
	'Q1191127', -- featural writing system
}

-- Wikidata class ids that select the "school" category (see `categories`).
-- Covers schools, universities and research institutions.
local school_ids = { 
	'Q3918', -- university
	'Q1663017', -- engineering school
	'Q38723', -- higher education institution
	'Q847027', -- grande école
	'Q3354859', -- collegiate university
	'Q875538', -- public university
	'Q15936437', -- research university
	'Q265662', -- national university
	'Q3914', -- school
	'Q57775519', -- upper secondary school
	'Q2511322', -- lycée
	'Q4671277', -- academic institution
	'Q31855', -- research institute
	
	'Q845392', -- polytechnic
	'Q1371037', -- institute of technology
}


-- Wikidata class ids that select the "hydrography" category (see `categories`).
local hydrography_ids = { 
    'Q4022', -- river
    'Q23397', -- lake
	'Q165', -- sea
	'Q949819', -- ship canal
	'Q12284', -- canal
	'Q1267889', -- waterway
	'Q1233637', -- river mouth
	'Q124714', -- spring (water)
	'Q1322134', -- gulf
	'Q9430', -- ocean
	'Q15324', -- body of water
	-- NOTE(review): the ids below carry no labels in the original; look them up on Wikidata and annotate them.
	'Q9019918',	'Q131681',	'Q4862338',	'Q3267675',	'Q204324',	'Q4366834',	'Q8261440',	'Q211302','Q13586859','Q3215290','Q47053','Q43197',
	'Q39594','Q187223','Q940023','Q5926864','Q37901','Q1210950','Q3705882','Q2507626','Q31615','Q33837','Q13137873','Q2936105','Q204894',
	'Q17018380','Q6341928','Q1140477','Q491713',
}

-- Wikidata class ids that select the "orography" category (see `categories`).
-- Besides mountains and valleys, the list also holds continents, landmasses and craters.
local orography_ids = { 
	'Q46831', -- mountain range
	'Q8502', -- mountain
	'Q1437459', -- non-geologically related mountain range
	'Q39816', -- valley
	'Q271669', -- landform
	'Q75520', -- plateau
	'Q34763', -- peninsula
	'Q55818', -- impact crater
	'Q3240715', -- crater
	
	'Q5107', -- continent
	'Q205895', -- landmass
	
	'Q150784', -- canyon
    'Q55462971', -- fluvial landform
    
    'Q133056' -- mountain pass
}

-- Wikidata class ids that select the "software" category (see `categories`).
local software_ids = { 
	'Q9135', -- operating system
	'Q218616', -- proprietary software
	'Q20983788', -- free operating system
	'Q7397', -- software
	'Q341', -- free software
	'Q6368', -- web browser
	'Q20825628', -- GNU package
}

-- Wikidata class ids that select the "disease" category (see `categories`).
local disease_ids = { 
	'Q12136', -- disease
	'Q18123741', -- infectious disease
	'Q179630', -- syndrome
	-- NOTE(review): the next id repeats 'Q12136' (disease) but is labelled
	-- "notifiable disease"; the intended id is presumably different and
	-- should be confirmed on Wikidata. As written the entry is redundant.
	'Q12136', -- notifiable disease
	'Q506680', -- endemic disease
	'Q169872', -- symptom
	'Q1441305', -- clinical sign
}

-- Wikidata properties whose mere presence on an item selects the "disease" category.
-- Only consulted as a fallback when no class id matched (see `p_categories`).
local disease_properties = { 
	'P1995', -- health specialty
	'P780', -- symptoms
	'P923', -- medical examinations
	'P924', -- possible treatment
	'P2176', -- drug used for treatment
	
	-- Disabled candidates (external-identifier properties), kept for reference:
--	'P8656', --Symptom Ontology ID
--	'P5082', --Store medisinske leksikon ID
--	'P1692', --ICD-9-CM ()
--	'P1748', --NCI Thesaurus ID
}
 

-- Wikidata class ids that select the "website" category (see `categories`).
local website_ids = { 
	'Q35127', -- website
	'Q171', -- wiki
	'Q15633582', -- MediaWiki website
	'Q327349', -- web directory
	'Q4182287', -- web search engine
	'Q10876391', -- Wikipedia language edition
	'Q19967801', -- online service
	'Q1273203', -- email service provider
	'Q1668024', -- service on internet
	'Q1343205', -- file hosting service
	'Q1210425', -- internet hosting service
	'Q17232649', -- news website
	'Q62694393', -- URL shortener
	'Q193424', -- web service
}

-- Wikidata class ids that select the "organization" category (see `categories`).
-- Covers companies, media outlets, political and military bodies, banks and sports clubs/teams.
local organization_ids = { 
	'Q4830453', -- business
	'Q6881511', -- enterprise
	'Q17990971', -- public enterprise
	'Q18388277', -- technology company
	'Q1589009', -- privately held company
	'Q1110794', -- daily newspaper
	'Q1153191', -- online newspaper
	'Q11032', -- newspaper
	'Q43229', -- organization
	'Q484652', -- international organization
	'Q245065', -- intergovernmental organization
	'Q15265344', -- broadcaster
	'Q2001305', -- television channel
	'Q14350', -- radio station
	'Q7188', -- government
	'Q7210356', -- political organisation
	'Q7278', -- political party
	'Q46970', -- airline
	'Q157031', -- foundation
	'Q708676', -- charitable organization
	'Q29918292', -- cultural organization
	'Q48204', -- voluntary association
	'Q15911314', -- association
	'Q5193377', -- cultural institution

	'Q163740', -- nonprofit organization
	'Q17127659', -- terrorist organization
	'Q1788992', -- criminal organization
	
	'Q772547', -- armed forces
	'Q17149090', -- armed organization
	'Q207320', -- paramilitary
	'Q1673189', -- irregular military
	
	'Q20857065', -- United States federal agency
	
	'Q327333', -- government agency
	
	'Q22687', -- bank
	'Q66344', -- central bank
	'Q895526', -- organ
	
	'Q476028', -- association football club
	'Q17270000', -- football club
	'Q847017', -- sports club
	'Q4438121', -- sports organization
	'Q6979593', -- national association football team
	'Q1194951', -- national sports team
	'Q15944511', -- association football team

	'Q988108', -- club
	
	'Q15899789', -- principal organ of the United Nations
	'Q15285626', -- organization established by the United Nations
}

-- Wikidata properties whose mere presence on an item selects the "organization" category.
-- Only consulted as a fallback when no class id matched (see `p_categories`).
local organization_properties = { 
	'P749', -- parent organization
	'P355', -- subsidiary
	'P159', -- headquarters location
	'P1454', -- legal form
}
 
-- Wikidata class ids that select the "building" category (see `categories`).
-- Covers buildings and other built structures (religious buildings, towers,
-- transport facilities, fortifications, dams, stadiums, ...).
local building_ids = { 
	'Q1802963', -- mansion
	'Q3947', -- house
    'Q52177058', -- civic building
    'Q16831714', -- government building
    
	'Q41176', -- building
	'Q294422', -- public building
	'Q1021645', -- office building
	'Q79146420', -- multistorey building
	'Q11755959', -- multi-storey urban building
	'Q18142', -- high-rise building

	'Q4989906', -- monument
	'Q811979', -- architectural structure
	'Q11303', -- skyscraper
	
	'Q162875', -- mausoleum
	'Q6023295', -- funerary structure
	
	'Q44539', -- temple
	'Q867143', -- Roman temple
	'Q5393308', -- Buddhist temple
	'Q842402', -- Hindu temple
	'Q96352513', -- religious building ruin
	'Q24398318', -- religious building
	'Q1370598', -- place of worship
	'Q120560', -- minor basilica
	'Q163687', -- basilica
	'Q16970', -- church building
	'Q1088552', -- Catholic church building
	'Q108325', -- chapel
	'Q56750657', -- hermitage
	'Q56242063', -- Protestant church building
	'Q56242250', -- Anglican or Episcopal cathedral
	'Q56242045', -- Anglican church
	'Q58079064', -- Protestant cathedral
	'Q32815', -- mosque
	
	'Q1154710', -- association football stadium
	
	'Q12819564', -- station
	'Q1248784', -- airport
	'Q644371', -- international airport
	'Q94993988', -- commercial traffic aerodrome
	'Q62447', -- aerodrome

	

	
	'Q9259', -- UNESCO World Heritage Site
	
	'Q88291', -- citadel
	'Q57821', -- fortification
	'Q1784293', -- cordon
	
	'Q1440300', -- observation tower
	'Q12518', -- tower
	'Q33506', -- museum
	'Q200334', -- bell tower
	'Q72926449', -- church tower
	'Q797765', -- inclined tower
	
	'Q16560', -- palace
	
	'Q15911738', -- hydroelectric power station
	'Q159719', -- power station
	
	'Q3497167', -- gravity dam
	'Q12323', -- dam
	
	'Q483110', -- stadium
	'Q641226', -- arena
}

-- Wikidata class ids that select the "chemical" category (see `categories`).
local chemical_ids = { 
	'Q11173', -- chemical compound
}

-- Wikidata class ids that select the "food" category (see `categories`).
-- NOTE(review): the original comment called this list "unused for the time
-- being", yet it is registered under "food" in both `categories` and
-- `queryindex`. The stated reason is that most Wikidata food items reference
-- these ids through 'subclass of' rather than 'instance of', so the list
-- presumably matches rarely; `food_properties` serves as the fallback.
local food_ids = {
	'Q2095', -- food
	'Q6460735', -- meals
	'Q13276', -- cake
	'Q5159627', -- confection
	'Q182940', -- dessert
	'Q951964', -- food product
	'Q26902770', -- Food products
	'Q4498085', -- bakery product
	'Q25403900', -- food ingredient
	'Q746549', -- dish
	'Q41415', -- soup
	'Q1401891', -- viennoiserie
	'Q1470834', -- fruit preparation
	'Q772630', -- main course
	'Q13270', -- biscuit
	'Q13266', -- cookie
	'Q477248', -- pastry
}

-- Wikidata properties whose mere presence on an item selects the "food" category.
-- Only consulted as a fallback when no class id matched (see `p_categories`).
local food_properties = { 
	'P5456', -- TasteAtlas ID
	'P2012', -- cuisine
	'P1821', -- Open Food Facts food category ID
}

-- Wikidata class ids that select the "event" category (see `categories`).
-- The list is broad: besides occurrences it also holds treaties, holidays,
-- heritage designations, awards and decorations.
local event_ids = {
	'Q1190554', -- occurrence
	'Q1656682', -- event
	'Q27968055', -- recurrent event edition
	
	-- disasters and crises
	'Q12184', -- pandemic
	'Q44512', -- epidemic
	'Q3241045', -- disease outbreak
	'Q838718', -- city fire
	'Q168983', -- conflagration
	'Q3839081', -- disaster
	'Q381072', -- crisis
	'Q290178', -- economic crisis

	
	-- historical events and agreements
	'Q13418847', -- historical event
	'Q625298', -- peace treaty
	'Q131569', -- treaty
	'Q321839', -- accord
	'Q2006324', -- agreement

	'Q7157512', -- peace conference

	-- festivals
	'Q1644573', -- pilgrimage
	'Q375011', -- religious festival
	'Q132241', -- festival
	'Q4801521', -- arts festival
	'Q1751626', -- theatre festival
	'Q868557', -- music festival
	'Q23902005', -- literary festival

	-- holidays
	'Q1197685', -- public holiday
	'Q60075825', -- Christian holy day
	'Q94920', -- Jewish holiday

	'Q11483816', -- annual event
	'Q2558684', -- world day
	'Q2673813', -- rest day
	'Q1445650', -- holiday
	
	'Q1062856', -- anniversary
	'Q200538', -- party

	'Q59544', -- intangible cultural heritage
	'Q210272', -- cultural heritage

	-- protest and political change
	'Q1914636', -- activity
	'Q273120', -- protest
	'Q10931', -- revolution
	'Q124734', -- rebellion
	'Q3109572', -- civil resistance
	'Q754479', -- nonviolent resistance

	'Q1673271', -- regime change
	'Q1510761', -- social change

	
	-- wars, conflicts and crimes
	'Q8465', -- civil war
	'Q198', -- war
	'Q350604', -- armed conflict
	'Q180684', -- conflict
	'Q750215', -- mass murder
	'Q41397', -- genocide
	'Q4817637', -- atrocity crime
	'Q2223653', -- terrorist attack
	'Q1139665', -- political murder
	'Q16738832', -- criminal case
	
	'Q188055', -- siege
	'Q273976', -- blockade
	'Q645883', -- military operation
	'Q28972820', -- operation

	
	'Q53706', -- robbery
	'Q1371150', -- hostage crisis

	
	'Q18608583', -- recurring sporting event
	'Q15275719', -- recurring event
	
	'Q2495862', -- congress
	'Q2761147', -- meeting
	
	-- elections
	'Q2618461', -- legislative election
	'Q1076105', -- general election
	'Q40231', -- election
	
	-- prizes and awards
	'Q5257307', -- prize
	'Q7191', -- Nobel Prize
	'Q618779', -- award
	'Q378427', -- literary award
	'Q17701409', -- economics award
	'Q11448906', -- science award
	'Q15229207', -- sports award
	'Q19020', -- Academy Awards
	'Q96474685', -- award for best original music
	
	-- orders, decorations and medals
	'Q11796413', -- decoration
	'Q193622', -- order
	'Q56291528', -- state order
	'Q3302125', -- state decoration

	'Q1788716', -- military decoration
	'Q973011', -- campaign medal
	'Q131647', -- medal


	
}

-- Wikidata class ids that select the "astronomical object" category (see `categories`).
local astr_object_ids = {
	'Q634', -- planet
	'Q13205267', -- planet of the Solar System
	'Q844911', -- superior planet
	'Q3504248', -- inner planet
	'Q3901935', -- inferior planet
	'Q2199', -- dwarf planet
	'Q16873378', -- planetary body
	'Q3132741', -- substellar object
	'Q400144', -- planemo
	'Q2537', -- natural satellite
	'Q1297322', -- satellite
	'Q121750', -- gas giant
	'Q21857994', -- giant planet
	'Q30014', -- outer planet
	'Q30250610', -- object in the outer Solar System

	'Q6999', -- astronomical object

}


-- Maps each category name (the string returned by this module) to the list of
-- Wikidata class ids that select it. Keys must match the entries of
-- `queryindex`, which fixes the order in which the categories are tested.
local categories = {
	["number"] = number_ids,
	["grapheme"] = grapheme_ids,
	["year"] = year_ids,
	["human"] = human_ids,
	["language"] = language_ids,
	["language family"] = lanfamily_ids,
	["ethnic group"] = ethnic_ids,
	["taxon"] = taxon_ids,
	["city"] = city_ids,
	["country"] = country_ids,
	["musical band"] = musical_band_ids,
	["album"] = album_ids,
	["tv"] = tv_ids,
	["book"] = book_ids,
	["intellectual work"] = int_work_ids,
	["religion"] = religion_ids,
	["script"] = script_ids,
	["school"] = school_ids,
	["hydrography"] = hydrography_ids,
	["orography"] = orography_ids,
	["software"] = software_ids,
	["disease"] = disease_ids,
	["website"] = website_ids,
	["organization"] = organization_ids,
	["building"] = building_ids,
	["chemical"] = chemical_ids,
	["event"] = event_ids,
	["astronomical object"] = astr_object_ids,
	["food"] = food_ids,
}

-- Maps a category name to a list of Wikidata property ids; an item carrying
-- any of those properties is assigned that category. Used only as a fallback
-- when no class id matched. Categories without an entry here have no
-- property-based fallback.
local p_categories = {
	["organization"] = organization_properties,
	["disease"] = disease_properties,
	["food"] = food_properties,
	["city"] = city_properties,
}

-- Ordered list of category names. Lookups walk this list from top to bottom
-- and return the first category that matches, so earlier entries take
-- priority when an item matches several categories. A plain array is needed
-- because the iteration order of the `categories` hash table is unspecified.
local queryindex = {
	"number",
	"grapheme",
	"year",
	"human",
	"language",
	"language family",
	"ethnic group",
	"taxon",
	"city",
	"country",
	"musical band",
	"album",
	"tv",
	"book",
	"intellectual work",
	"religion",
	"script",
	"school",
	"hydrography",
	"orography",
	"software",
	"disease",
	"website",
	"organization",
	"building",
	"chemical",
	"event",
	"astronomical object",
	"food"
}

local p = {}
local lualinq = require "Module:LuaLinq"

--- Tells whether two id lists have at least one id in common.
-- Declared `local` so the module no longer writes to the global environment
-- (a global assignment raises an error when Scribunto's strict mode is
-- active). The lookup builds a set from `item_ids` once instead of comparing
-- every pair of ids.
-- @param item_ids array of Wikidata id strings (nil is treated as empty)
-- @param category_ids array of Wikidata id strings (nil is treated as empty)
-- @return true when some id occurs in both arrays, false otherwise
local function belongsTo(item_ids, category_ids)
	local wanted = {}
	for _, id in ipairs(item_ids or {}) do
		wanted[id] = true
	end
	for _, id in ipairs(category_ids or {}) do
		if wanted[id] then
			return true
		end
	end
	return false
end

--- Finds the category whose class-id list shares an id with `item_ids`.
-- Categories are tried in `queryindex` order and the first match is returned.
-- Declared `local` so the module no longer writes to the global environment
-- (a global assignment raises an error when Scribunto's strict mode is active).
-- @param item_ids array of Wikidata item id strings
-- @return the matching category name, or nil when nothing matches
local function findCategoryName(item_ids)
	for _, name in ipairs(queryindex) do
		local class_ids = categories[name]
		-- Skip names that have no class list instead of passing nil along.
		if class_ids and belongsTo(item_ids, class_ids) then
			return name
		end
	end
	return nil
end

--- Tells whether an entity is declared as a taxon common name.
-- That is the case when one of its P31 statements has the value Q55983715 and
-- carries a P642 qualifier whose first snak holds an item id.
-- Statements and qualifier snaks without a datavalue ("unknown value" /
-- "no value" snaks) are skipped instead of raising an indexing error, and a
-- matching statement without a P642 qualifier simply does not count.
-- Declared `local` so the module no longer writes to the global environment.
-- @param item entity table as returned by mw.wikibase.getEntity
-- @return true or false
local function isTaxonCommonName(item)
	local statements = item and item.claims and item.claims["P31"]
	if not statements then
		return false
	end
	for _, statement in ipairs(statements) do
		local datavalue = statement.mainsnak and statement.mainsnak.datavalue
		local value = datavalue and datavalue.value
		if type(value) == 'table' and value['id'] == "Q55983715" then
			local of_snaks = statement.qualifiers and statement.qualifiers["P642"]
			local first_snak = of_snaks and of_snaks[1]
			local of_datavalue = first_snak and first_snak.datavalue
			local of_value = of_datavalue and of_datavalue.value
			if type(of_value) == 'table' and of_value['id'] ~= nil then
				return true
			end
		end
	end
	return false
end

--- Finds a category from the properties an entity carries.
-- Categories are tried in `queryindex` order; a category matches when the
-- entity has at least one statement for any property listed for it in
-- `p_categories`.
-- Declared `local` so the module no longer writes to the global environment
-- (a global assignment raises an error when Scribunto's strict mode is active).
-- @param item entity table as returned by mw.wikibase.getEntity
-- @return the matching category name, or nil when nothing matches
local function findCategoryNameFromProperties(item)
	local claims = item and item.claims
	if not claims then
		return nil
	end
	for _, name in ipairs(queryindex) do
		local property_ids = p_categories[name]
		if property_ids then
			for _, property_id in ipairs(property_ids) do
				if claims[property_id] then
					return name
				end
			end
		end
	end
	return nil
end

--- Tells whether a value counts as "provided".
-- nil and strings that are empty after trimming are not set; every other
-- value (including false and numbers) is.
-- @param var any value
-- @return true or false
local function isSet(var)
	if var == nil then
		return false
	end
	if type(var) == 'string' and mw.text.trim(var) == '' then
		return false
	end
	return true
end

--- Works out which entity id to use.
-- Looks at `item`, then `from`, then (only when `unnamed` is truthy) the
-- trimmed first positional argument, first in `args` and then in `pargs`.
-- A value containing ":" is reduced to the part after the first colon
-- (e.g. "Property:P31" becomes "P31"). When nothing usable is supplied, the
-- id connected to the current page is returned.
-- @param args argument table of the call
-- @param pargs argument table of the parent frame (may be nil)
-- @param unnamed whether the first positional argument may carry the id
-- @return the entity id, or whatever mw.wikibase.getEntityIdForCurrentPage() returns
local function getEntityId(args, pargs, unnamed)
	local parentArgs = pargs or {}

	-- Reads the candidate id from one argument table.
	local function pick(source)
		local named = source.item or source.from
		if named then
			return named
		end
		if unnamed then
			return mw.text.trim(source[1] or '')
		end
		return nil
	end

	local id = pick(args)
	if not isSet(id) then
		id = pick(parentArgs)
	end

	if not isSet(id) then
		return mw.wikibase.getEntityIdForCurrentPage()
	end

	if string.find(id, ":") then -- remove prefix as Property:Pid
		id = mw.text.split(id, ":")[2]
	end
	return id
end

-- Collects the item ids held by a list of statements, skipping statements
-- whose main snak has no datavalue ("unknown value" / "no value").
local function collectItemIds(statements)
	local ids = {}
	for _, statement in ipairs(statements or {}) do
		local datavalue = statement.mainsnak and statement.mainsnak.datavalue
		local value = datavalue and datavalue.value
		if type(value) == 'table' and value['id'] ~= nil then
			ids[#ids + 1] = value['id']
		end
	end
	return ids
end

--- Returns the category name for a Wikidata entity.
-- Resolution order:
--   1. the entity's P31 values matched against the class-id lists;
--   2. the P279 values of the entity's first P31 class;
--   3. the "taxon common name" special case;
--   4. the presence of category-specific properties on the entity.
-- Steps 2 and 3 only run when the entity has at least one usable P31 value.
-- Compared with the previous version, snaks without a datavalue, a parent
-- class that cannot be loaded, and a missing parent frame no longer raise
-- errors; the function just moves on to the next step.
-- @param frame Scribunto frame (via #invoke) or a plain argument table (via require)
-- @return the category name, nil when none matches, or the strings
--         'entity not found' / 'entity not found bis' when no entity is available
function p.main(frame)
	local args = frame.args or frame -- via invoke or require
	-- getParent() may return nil, and a plain table carrying `args` has no getParent at all.
	local parent = frame.args and frame.getParent and frame:getParent()
	local pargs = parent and parent.args or {}
	local id = getEntityId(args, pargs)
	if id == nil then
		return 'entity not found'
	end

	local item = mw.wikibase.getEntity(id)
	if item == nil or item.claims == nil then
		return 'entity not found bis'
	end

	local class_ids = collectItemIds(item.claims["P31"])
	local name = nil

	if #class_ids > 0 then
		name = findCategoryName(class_ids)

		if name == nil then -- try once more with the parents of the first class
			local first_class = mw.wikibase.getEntity(class_ids[1])
			if first_class ~= nil and first_class.claims ~= nil then
				name = findCategoryName(collectItemIds(first_class.claims["P279"]))
			end
		end

		if name == nil and isTaxonCommonName(item) then
			name = "taxon common name"
		end
	end

	if name == nil then
		name = findCategoryNameFromProperties(item)
	end

	return name
end

-- Aliases
-- Alias entry point: forwards the frame to p.main and returns its result.
p.find = function(frame)
	return p.main(frame)
end
-- Alias entry point: forwards the frame to p.main and returns its result.
p.findCategory = function(frame)
	return p.main(frame)
end

return p