#install.packages(c("openxlsx", "stringr", "dplyr", "writexl", "ggplot2", "RColorBrewer", "stringi", "stopwords", "textstem", "tidytext", "topicmodels", "broom", "textmineR", "tm", "purrr", "future", "furrr", "grid", "gridExtra", "httr", "countrycode", "car", "lmtest", "sandwich", "httr2", "jsonlite", "readr", "fs", "scales", "stargazer"))
setwd("~/Desktop/Studium/Master/Master Thesis/Data Analysis/Data") # To reproduce, set this to your local data directory

#install.packages("openxlsx")

library(openxlsx)
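# Read the job-ads sheet; rows 4:10000 skip the header rows above the table (assumed workbook layout)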
Data_job_ads <- read.xlsx(
  xlsxFile = "Data collection.xlsx",
  sheet    = "Job Advertisments",
  rows     = 4:10000,    
  detectDates = TRUE
)

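# Read the TTO sheet: first four columns, rows 1:10000 (layout as in the workbook)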
Data_tto <- read.xlsx(
  xlsxFile = "Data collection.xlsx",
  sheet    = "TTO data",
  cols     = 1:4,       
  rows     = 1:10000,    
  detectDates = TRUE
)
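
# Optional sanity check (not in the original script): confirm both sheets were
# imported with the expected shape before further processing.
dim(Data_job_ads)
dim(Data_tto)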
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
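# Note: dplyr masks stats::filter() and stats::lag(); the stats versions remain
# reachable via the :: prefix, e.g. stats::filter() for linear filtering.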
# Check for missing values (empty strings or NAs)
missing_values <- Data_job_ads == "" | is.na(Data_job_ads)
any(missing_values)
## [1] TRUE
sum(missing_values)
## [1] 19
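# Inspect the rows that contain at least one missing entry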
Data_job_ads[rowSums(missing_values) > 0, ]
##     DocID    Year       Country                                     Name
## 75     75    2024 United States                  Kansas State University
## 86     86    2024 United States        Washington University in St Louis
## 88     88    2025      Bulgaria                        Trakia University
## 111   111    2025       Germany              Leibniz University Hannover
## 153   153    2025 United States                  Kansas State University
## 155   155    2025 United States               Louisiana State University
## 171   171    2025 United States                      Syracuse University
## 189   189    2025 United States         University of Michigan-Ann Arbor
## 198   198    2025 United States            University of Texas at Austin
## 230   230 No date United States University of Maryland, Baltimore County
## 239    NA      27          <NA>                                     <NA>
##     No_Students Director Staff University Contained_in_THE
## 75        30770        0     1          1               NA
## 86        17012        0     1          1               NA
## 88         8000        0     1          1               NA
## 111       27000        0     1          1               NA
## 153       21000        0     1          1               NA
## 155       42016        1     0          1               NA
## 171       22698        1     0          1               NA
## 189       50095        1     0          1               NA
## 198       42444        0     1          1               NA
## 230       41725        0     1          1               NA
## 239          NA       NA    NA         NA               NA
##                                                                                                                Rel_Info
## (Rel_Info contains the full advertisement text for rows 75, 86, 88, 111, 153,
##  155, 171, 189, 198, 230; the very long text output is not reproduced here)
## 239                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  <NA>
rm(missing_values)

# Replaces unavailable entries ("" and "na") with NA
Data_tto[Data_tto == ""] <- NA
Data_tto[Data_tto == "na"] <- NA

# Merges both dataframes (Data_job_ads and Data_tto) by a normalized university-name key
Data_job_ads_key <- Data_job_ads %>%
  mutate(Name_key = Name %>% str_squish() %>% str_to_lower())

Data_tto_key <- Data_tto %>%
  mutate(Name_key = Name %>% str_squish() %>% str_to_lower())
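
# Illustrative key normalization (hypothetical input):
# str_to_lower(str_squish("  Yale   University ")) returns "yale university",
# so both dataframes share a whitespace- and case-insensitive join key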

Data <- Data_job_ads_key %>%
  left_join(
    Data_tto_key %>% select(Name_key, TTO_personnel, TTO_founding_year),
    by = "Name_key"
  ) %>%
  select(-Name_key) %>%
  relocate(Rel_Info, .after = last_col())

Data$TTO_founding_year <- as.numeric(as.character(Data$TTO_founding_year))
# TTO age in operating years relative to 2026 (cf. the axis label used below)
Data$TTO_age <- (2026 - Data$TTO_founding_year)

# Moves TTO_founding_year (column 11) before TTO_personnel (column 10)
Data <- relocate(Data, all_of(names(Data)[11]), .before = 10)

# Cleaning function for institution names: normalizes text and removes extra whitespace, hidden characters, non-breaking spaces, etc.
clean_name <- function(x) {
  x <- as.character(x)
  x <- stringi::stri_trans_nfkc(x)                         # unicode normalize
  x <- stringi::stri_replace_all_fixed(x, "\u00A0", " ")   # NBSP -> normal space
  x <- stringi::stri_replace_all_regex(x, "[\t\r\n]+", " ")# tabs/newlines -> space
  x <- stringi::stri_replace_all_regex(x, "\\p{Cf}+", "")  # remove invisible format chars
  x <- str_replace_all(x, "[[:space:]]+", " ")             # collapse any whitespace
  x <- str_trim(x)                                         # trim ends
  x
}
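
# Illustrative check (hypothetical input): the non-breaking space and line break
# below are normalized away:
# clean_name("Leibniz\u00A0University\r\nHannover ") returns "Leibniz University Hannover"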

Data <- Data %>%
  mutate(Name = clean_name(Name))

# Counts non-university entries; returns NA here because University contains missing values
sum(Data$University == 0)
## [1] NA
# Keeps university rows; note that filter() also drops rows where University is NA
Data <- Data %>%
  filter(University != 0)

library(countrycode)
# Maps countries to continents, then relabels to match the sample: all "Americas"
# observations are North American, all "Oceania" ones Australian, and all "Africa"
# ones South African
Data <- Data %>%
  mutate(
    Continent = countrycode(Country, origin = "country.name", destination = "continent"),
    Continent = case_when(
      Continent == "Americas" ~ "North America",
      Continent == "Oceania"  ~ "Australia",
      Continent == "Africa"   ~ "South Africa",
      TRUE ~ Continent
    )
  )
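
# Illustrative mapping before the manual relabeling (countrycode defaults):
# countrycode(c("Germany", "Japan", "United States"), "country.name", "continent")
# returns "Europe" "Asia" "Americas"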


# Recodes "no date" variants in the Year column to NA
Data <- Data %>%
  mutate(
    Year = ifelse(tolower(trimws(as.character(Year))) %in% c("no date", "nodate", "no_date"),
                  NA, as.character(Year))
  )


rm(Data_job_ads_key)
rm(Data_job_ads)
rm(Data_tto_key)
rm(Data_tto)
library(writexl)
library(RColorBrewer)
library(ggplot2)

nrow(Data)
## [1] 220
no_countries <- length(unique(Data$Country))
no_countries
## [1] 19
no_instituitions <- length(unique(Data$Name))
no_instituitions
## [1] 164
Job_ads_country <- Data %>%
  group_by(Country) %>%
  summarise(n_ads = n()) %>%
  arrange(desc(n_ads))
Job_ads_country
## # A tibble: 19 × 2
##    Country              n_ads
##    <chr>                <int>
##  1 United States          138
##  2 Germany                 23
##  3 United Kingdom          21
##  4 Australia                8
##  5 South Africa             6
##  6 Ireland                  5
##  7 Japan                    4
##  8 China                    3
##  9 Singapore                2
## 10 Austria                  1
## 11 Belgium                  1
## 12 Bulgaria                 1
## 13 Canada                   1
## 14 Luxembourg               1
## 15 Netherlands              1
## 16 Portugal                 1
## 17 Saudi Arabia             1
## 18 Spain                    1
## 19 United Arab Emirates     1
Job_ads_country$share <- Job_ads_country$n_ads/nrow(Data)

# List of institutions, output as excel file
institutions_list <- unique(Data$Name)

sort(institutions_list)
##   [1] "Auburn University"                                         
##   [2] "Augusta University"                                        
##   [3] "Australian National University"                            
##   [4] "Boise State University"                                    
##   [5] "Brown University"                                          
##   [6] "Cardiff University"                                        
##   [7] "Carl von Ossietzky University Oldenburg"                   
##   [8] "Case Western Reserve University"                           
##   [9] "Chemnitz University of Technology"                         
##  [10] "City Colleges of Chicago"                                  
##  [11] "City of New York University"                               
##  [12] "Columbia University"                                       
##  [13] "Deakin University"                                         
##  [14] "DePaul University"                                         
##  [15] "East Tennessee State University"                           
##  [16] "Eastbaverian University of Technology Amberg-Weiden"       
##  [17] "Eindhoven University of Technology"                        
##  [18] "Emory University"                                          
##  [19] "Florida Polytechnic University"                            
##  [20] "Florida State University"                                  
##  [21] "Foundation Veterinary University Hannover"                 
##  [22] "Friedrich Schiller University Jena"                        
##  [23] "Gustavus Adolphus College"                                 
##  [24] "Heriot-Watt University"                                    
##  [25] "Hiroshima University"                                      
##  [26] "Imperial College London"                                   
##  [27] "Indiana University"                                        
##  [28] "Johns Hopkins University"                                  
##  [29] "Kansas State University"                                   
##  [30] "Karlsruhe Institute of Technology"                         
##  [31] "Keio University"                                           
##  [32] "Kennesaw State University"                                 
##  [33] "Khalifa University"                                        
##  [34] "King Abdullah University of Science and Technology (KAUST)"
##  [35] "Leeds Beckett University"                                  
##  [36] "Lehigh University"                                         
##  [37] "Leibniz University Hannover"                               
##  [38] "Leuphana University of Lüneburg"                           
##  [39] "Louisiana State University"                                
##  [40] "Massachusetts Institute of Technology"                     
##  [41] "Mayo Clinic College"                                       
##  [42] "Merrimack College"                                         
##  [43] "Michigan State University"                                 
##  [44] "Minnesota State University"                                
##  [45] "Missouri University of Science and Technology"             
##  [46] "Mizzou - University of Missouri"                           
##  [47] "Murdoch University"                                        
##  [48] "New York University"                                       
##  [49] "North Carolina State University"                           
##  [50] "Ohio State University (Main campus)"                       
##  [51] "Ohio University (Main campus)"                             
##  [52] "Oklahoma State University"                                 
##  [53] "Oregon State University"                                   
##  [54] "Penn State (Main campus)"                                  
##  [55] "Queen Mary University of London"                           
##  [56] "Rhodes University"                                         
##  [57] "RMIT University"                                           
##  [58] "Rutgers University–New Brunswick"                          
##  [59] "San Jose State University"                                 
##  [60] "Singapur Management University"                            
##  [61] "Stanford University"                                       
##  [62] "State University of New York"                              
##  [63] "Stellenbosch University"                                   
##  [64] "SUNY Binghamton University"                                
##  [65] "Syracuse University"                                       
##  [66] "Technische Universität Ilmenau"                            
##  [67] "Temple University"                                         
##  [68] "Texas Southern University"                                 
##  [69] "The Chinese University of Hong Kong"                       
##  [70] "The Hong Kong Polytechnic University"                      
##  [71] "The Rockefeller University"                                
##  [72] "The University of Chicago"                                 
##  [73] "The University of Tokyo"                                   
##  [74] "The University of Tulsa"                                   
##  [75] "Trakia University"                                         
##  [76] "TU Dresden"                                                
##  [77] "Tufts University"                                          
##  [78] "UCL"                                                       
##  [79] "University at Buffalo"                                     
##  [80] "University College Cork"                                   
##  [81] "University College Dublin"                                 
##  [82] "University for Applied Science München"                    
##  [83] "University for the Creative Arts"                          
##  [84] "University of Alabama at Birmingham"                       
##  [85] "University of Applied Science Darmstadt"                   
##  [86] "University of Arizona"                                     
##  [87] "University of Arkansas"                                    
##  [88] "University of Birmingham"                                  
##  [89] "University of Bremen"                                      
##  [90] "University of Bridgeport"                                  
##  [91] "University of Burgos"                                      
##  [92] "University of California San Francisco"                    
##  [93] "University of California, Berkeley"                        
##  [94] "University of California, Los Angeles"                     
##  [95] "University of California, Merced"                          
##  [96] "University of California, Santa Cruz"                      
##  [97] "University of Cambridge"                                   
##  [98] "University of Central Florida"                             
##  [99] "University of Chicago"                                     
## [100] "University of Cincinnati – Uptown"                         
## [101] "University of Coimbra"                                     
## [102] "University of Cologne"                                     
## [103] "University of Colorado Denver/Anschutz Medical Campus"     
## [104] "University of Delaware"                                    
## [105] "University of Dundee"                                      
## [106] "University of East Westphalia Lippe"                       
## [107] "University of Edinburgh"                                   
## [108] "University of Erlangen-Nuremberg"                          
## [109] "University of Galway"                                      
## [110] "University of Glasgow"                                     
## [111] "University of Göttingen"                                   
## [112] "University of Hawai’i at Mānoa"                            
## [113] "University of Hildesheim"                                  
## [114] "University of Hong Kong"                                   
## [115] "University of Houston"                                     
## [116] "University of Illinois at Urbana-Champaign"                
## [117] "University of Johannesburg"                                
## [118] "University of Kentucky"                                    
## [119] "University of Leicester"                                   
## [120] "University of Liege"                                       
## [121] "University of Louisville"                                  
## [122] "University of Luxembourg"                                  
## [123] "University of Maryland, Baltimore County"                  
## [124] "University of Maryland, College Park"                      
## [125] "University of Massachusetts"                               
## [126] "University of Melbourne"                                   
## [127] "University of Michigan-Ann Arbor"                          
## [128] "University of Minnesota"                                   
## [129] "University of Mississippi"                                 
## [130] "University of New Hampshire"                               
## [131] "University of Nordhausen"                                  
## [132] "University of North Carolina at Chapel Hill"               
## [133] "University of North Texas"                                 
## [134] "University of Oregon"                                      
## [135] "University of Oxford"                                      
## [136] "University of Pennsylvania"                                
## [137] "University of Plymouth"                                    
## [138] "University of Pretoria"                                    
## [139] "University of Rhein-Waal"                                  
## [140] "University of Rochester"                                   
## [141] "University of South Australia"                             
## [142] "University of South Florida"                               
## [143] "University of South Florida (Tampa)"                       
## [144] "University of Southampton"                                 
## [145] "University of Southern California"                         
## [146] "University of Technology Sydney"                           
## [147] "University of Texas at Arlington"                          
## [148] "University of Texas at Austin"                             
## [149] "University of Texas Rio Grande Valley"                     
## [150] "University of the Western Cape"                            
## [151] "University of Utah"                                        
## [152] "University of Vienna"                                      
## [153] "University of Virginia (Main campus)"                      
## [154] "University of Warwick"                                     
## [155] "University of Waterloo"                                    
## [156] "University of Wyoming"                                     
## [157] "University System of New Hampshire"                        
## [158] "Virginia Commonwealth University"                          
## [159] "Waseda University"                                         
## [160] "Washington University in St Louis"                         
## [161] "Wayne State University"                                    
## [162] "Western Sydney University"                                 
## [163] "Yale University"                                           
## [164] "York College of Pennsylvania"
Unique_df <- data.frame(Institution = institutions_list)

write_xlsx(
  Unique_df,
  path = file.path("/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Data", # For reproduction, specify your path
                   "Unique Institutions.xlsx")
)

Unique_institutions <- Unique_df

# Number of institutions by country
No_institutions_country <- Data %>%
  group_by(Country) %>%
  summarise(Count = n_distinct(Name)) %>%
  arrange(desc(Count))
print(No_institutions_country)
## # A tibble: 19 × 2
##    Country              Count
##    <chr>                <int>
##  1 United States           94
##  2 Germany                 20
##  3 United Kingdom          16
##  4 Australia                8
##  5 South Africa             5
##  6 Japan                    4
##  7 China                    3
##  8 Ireland                  3
##  9 Austria                  1
## 10 Belgium                  1
## 11 Bulgaria                 1
## 12 Canada                   1
## 13 Luxembourg               1
## 14 Netherlands              1
## 15 Portugal                 1
## 16 Saudi Arabia             1
## 17 Singapore                1
## 18 Spain                    1
## 19 United Arab Emirates     1
# Sanity check: country counts sum to the total number of unique institutions
sum(No_institutions_country$Count)
## [1] 164
no_instituitions
## [1] 164
No_institutions_country$Share <- (No_institutions_country$Count/ no_instituitions)

# Key descriptive indicators
no_directors <- sum(Data$Director)
share_director <- no_directors/ nrow(Data)
no_staff <- sum(Data$Staff)
share_staff <- no_staff/ nrow(Data)
no_universities <- sum(Data$University)
share_universities <- no_universities/ nrow(Data)

summary_No_students <- summary(Data$No_Students, na.rm = TRUE)
summary_No_students
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     200   15368   27125   36991   44698  468000
Data$TTO_personnel <- suppressWarnings(as.numeric(as.character(Data$TTO_personnel)))
summary_TTO_personnel <- summary(Data$TTO_personnel, na.rm = TRUE)

summary_tto_age <- summary(Data$TTO_age, na.rm = TRUE)
summary_tto_age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.00   20.00   27.00   27.69   37.00   61.00     139
library(dplyr)
library(ggplot2)
library(RColorBrewer)

# Pie chart of unique institutions by country
# Selects the top 5 countries by share and aggregates the rest into "Others"
Top5 <- head(No_institutions_country, 5)
print(Top5)
## # A tibble: 5 × 3
##   Country        Count  Share
##   <chr>          <int>  <dbl>
## 1 United States     94 0.573 
## 2 Germany           20 0.122 
## 3 United Kingdom    16 0.0976
## 4 Australia          8 0.0488
## 5 South Africa       5 0.0305
Others <- No_institutions_country %>%
  slice(6:n()) %>%  
  summarise(
    Country = "Others",
    Count = sum(Count),
    Share = sum(Share))

# Checks that the shares sum to one
sum(Top5$Share) + Others$Share
## [1] 1
Plot_df <- bind_rows(Top5, Others)
Plot_df %>%
  mutate(Share = round(Share, 3))
## # A tibble: 6 × 3
##   Country        Count Share
##   <chr>          <int> <dbl>
## 1 United States     94 0.573
## 2 Germany           20 0.122
## 3 United Kingdom    16 0.098
## 4 Australia          8 0.049
## 5 South Africa       5 0.03 
## 6 Others            21 0.128
Plot_df <- Plot_df %>%
  mutate(Country = factor(Country, levels = c(setdiff(Country, "Others"), "Others")))

colors <- brewer.pal(6, "Pastel1")

ggplot(Plot_df, aes(x = "", y = Share, fill = Country)) +
  geom_col(width = 1, color = "white") +         
  coord_polar(theta = "y") +                      
  scale_fill_manual(values = colors) +          
  labs(
    # title = "Share of Unique Institutions by Country",
    fill = "Country"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),

   # legend.title = element_text(face = "bold", size = 30),
    legend.title = element_blank(),  # remove legend title
    legend.text  = element_text(size = 30, lineheight = 1.4),  # more spacing between names
    legend.key.size = grid::unit(1.2, "cm"),
    legend.spacing.y = grid::unit(1.25, "cm")                  # more vertical gap between items
  )

ggsave(
  filename = "Piechart_Share_Unique_Institutions_Country.jpg", 
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

# Pie chart by number of job advertisements
Top5 <- head(Job_ads_country, 5)
print(Top5)
## # A tibble: 5 × 3
##   Country        n_ads  share
##   <chr>          <int>  <dbl>
## 1 United States    138 0.627 
## 2 Germany           23 0.105 
## 3 United Kingdom    21 0.0955
## 4 Australia          8 0.0364
## 5 South Africa       6 0.0273
Others <- Job_ads_country %>%
  slice(6:n()) %>%  
  summarise(
    Country = "Others",
    n_ads = sum(n_ads),
    share = sum(share))

# Checks that the shares sum to one
sum(Top5$share) + Others$share
## [1] 1
Plot_df <- bind_rows(Top5, Others)
Plot_df %>%
  mutate(
    share = round(share, 3)
  )
## # A tibble: 6 × 3
##   Country        n_ads share
##   <chr>          <int> <dbl>
## 1 United States    138 0.627
## 2 Germany           23 0.105
## 3 United Kingdom    21 0.095
## 4 Australia          8 0.036
## 5 South Africa       6 0.027
## 6 Others            24 0.109
Plot_df <- Plot_df %>%
  mutate(Country = factor(Country, levels = c(setdiff(Country, "Others"), "Others")))

colors <- brewer.pal(6, "Pastel1")

ggplot(Plot_df, aes(x = "", y = share, fill = Country)) +
  geom_col(width = 1, color = "white") +         
  coord_polar(theta = "y") +                      
  scale_fill_manual(values = colors) +          
  labs(
   # title = "Share of Job advertisements by Country",
    fill = "Country"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),

   # legend.title = element_text(face = "bold", size = 30),
    legend.title = element_blank(),  # remove legend title
    legend.text  = element_text(size = 30, lineheight = 1.4),  # more spacing between names
    legend.key.size = grid::unit(1.2, "cm"),
    legend.spacing.y = grid::unit(1.25, "cm")                  # more vertical gap between items
  )

ggsave(
  filename = "Piechart_Share_Job_ads_Country.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
Plot_df <- Data %>%
  filter(!is.na(Continent), !is.na(Name)) %>%
  distinct(Name, Continent) %>%      # unique institutions
  count(Continent, name = "Count") %>%
  mutate(Share = Count / sum(Count))

colors <- brewer.pal(max(3, nrow(Plot_df)), "Pastel1")[1:nrow(Plot_df)]

ggplot(Plot_df, aes(x = "", y = Share, fill = Continent)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  scale_fill_manual(values = colors) +
  labs(
   # title = "Share of continents by unique institutions",
    fill = "Continent"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),

   # legend.title = element_text(face = "bold", size = 30),
    legend.title = element_blank(),  # remove legend title
    legend.text  = element_text(size = 30, lineheight = 1.4),  # more spacing between names
    legend.key.size = grid::unit(1.2, "cm"),
    legend.spacing.y = grid::unit(1.25, "cm")                  # more vertical gap between items
  )

ggsave(
  filename = "Piechart_Share_Unique_Institutions_Continent.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

# Domination of North America and Europe
sum_share_eu_na <- Plot_df %>%
  filter(Continent %in% c("Europe", "North America")) %>%
  summarise(sum_share = sum(Share, na.rm = TRUE))

Plot_df <- Data %>%
  filter(!is.na(Continent)) %>%
  count(Continent, name = "Count") %>%   # counts job ads (rows) by continent
  mutate(Share = Count / sum(Count))

colors <- brewer.pal(6, "Pastel1")

ggplot(Plot_df, aes(x = "", y = Share, fill = Continent)) +
  geom_col(width = 1, color = "white") +         
  coord_polar(theta = "y") +                      
  scale_fill_manual(values = colors) +          
  labs(
   # title = "Share of Job advertisements by Continent",
    fill = "Continent"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),

   # legend.title = element_text(face = "bold", size = 30),
    legend.title = element_blank(),  # remove legend title
    legend.text  = element_text(size = 30, lineheight = 1.4),  # more spacing between names
    legend.key.size = grid::unit(1.2, "cm"),
    legend.spacing.y = grid::unit(1.25, "cm")                  # more vertical gap between items
  )

ggsave(
  filename = "Piechart_Share_Job_ads_Continent.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)


rm(colors)
rm(Plot_df)
rm(Others)
rm(Top5)
rm(sum_share_eu_na)
# Ensures numeric types (TTO_personnel was already coerced above)
Data$TTO_age      <- as.numeric(as.character(Data$TTO_age))
Data$TTO_personnel <- as.numeric(as.character(Data$TTO_personnel))
Data$No_Students  <- as.numeric(as.character(Data$No_Students))

# Use exactly the rows that will be plotted (no missing aesthetics)
Data_plot <- Data[complete.cases(Data[, c("TTO_age","TTO_personnel","No_Students","Continent")]), ]
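
# Equivalent tidyverse form (same rows kept):
# Data_plot <- Data %>% tidyr::drop_na(TTO_age, TTO_personnel, No_Students, Continent)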

# Regression on the same data (Aggregated, no differentiation between continents)
fit  <- lm(TTO_personnel ~ TTO_age, data = Data_plot)
s    <- summary(fit)
ct   <- s$coefficients

n_used  <- nrow(Data_plot)
b0 <- ct["(Intercept)", "Estimate"]; se0 <- ct["(Intercept)", "Std. Error"]; p0 <- ct["(Intercept)", "Pr(>|t|)"]
b1 <- ct["TTO_age",      "Estimate"]; se1 <- ct["TTO_age",      "Std. Error"]; p1 <- ct["TTO_age",      "Pr(>|t|)"]
r2 <- s$r.squared

cap <- paste0(
  "The plot only represents rows with known TTO size and age (n = ", n_used, "). ",
  "The point size \nrepresents the number of students.\n",
  "Linear regression (lm: TTO_size ~ TTO_age):\n",
  "Intercept = ", formatC(b0, digits = 3, format = "f"), " (SE ", formatC(se0, digits = 3, format = "f"), ", p = ", format.pval(p0, digits = 3), ")\n",
  "Age coef  = ", formatC(b1, digits = 3, format = "f"), " (SE ", formatC(se1, digits = 3, format = "f"), ", p = ", format.pval(p1, digits = 3), ")\n",
  "R² = ", formatC(r2, digits = 3, format = "f")
)

ggplot(Data_plot, aes(x = TTO_age, y = TTO_personnel, size = No_Students, color = Continent)) +
  geom_point(alpha = 0.75) +
  geom_smooth(aes(group = 1), method = "lm", se = TRUE, linewidth = 1) +
  labs(
    # title = "The Relationship between TTO Size and Age",
    x = "TTO Age (Operating years by 2026)",
    y = "TTO Size (personnel)",
    color = "Continent",
    caption = cap
  ) +
  guides(size = "none") +
  theme_minimal() +
  theme(
    axis.title   = element_text(size = 12, margin = margin(t=5)),
    axis.text    = element_text(size = 12),
    plot.caption = element_text(size = 12, hjust = 0, lineheight = 1.5, margin = margin(t=10)), # more top margin for caption
    plot.margin  = margin(10, 10, 10, 10),                    # overall bottom space
    legend.title = element_blank(),
    legend.text  = element_text(size = 12, lineheight = 1.4),
    legend.key.size = grid::unit(0.8, "cm"),
    legend.spacing.y = grid::unit(0.3, "cm")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: size
## and colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
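# Note: the dropped-aesthetics warning above is benign. It could be avoided by
# mapping size/colour only where they are needed, e.g. (sketch):
# ggplot(Data_plot, aes(x = TTO_age, y = TTO_personnel)) +
#   geom_point(aes(size = No_Students, color = Continent), alpha = 0.75) +
#   geom_smooth(method = "lm", se = TRUE, linewidth = 1)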

# How many observations are used in the plot (the rest dropped due to NAs)
Data_plot %>%
  dplyr::select(TTO_age, TTO_personnel, No_Students, Continent) %>%
  tidyr::drop_na() %>%
  nrow()
## [1] 65
ggsave(
  filename = "TTO_Scatterplot_Age_Size.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: size
## and colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
# Filter to rows that will be plotted
Data_plot_filt <- Data %>%
  mutate(
    TTO_personnel = as.numeric(as.character(TTO_personnel)),
    No_Students   = as.numeric(as.character(No_Students))
  ) %>%
  filter(!is.na(No_Students), No_Students > 0,
         !is.na(TTO_personnel),
         !is.na(Continent))

# Regression on the same data used in the plot
fit <- lm(TTO_personnel ~ log(No_Students), data = Data_plot_filt)
s   <- summary(fit)
ct  <- s$coefficients

n_used <- nrow(Data_plot_filt)
b0 <- ct["(Intercept)", "Estimate"]; se0 <- ct["(Intercept)", "Std. Error"]; p0 <- ct["(Intercept)", "Pr(>|t|)"]
b1 <- ct["log(No_Students)", "Estimate"]; se1 <- ct["log(No_Students)", "Std. Error"]; p1 <- ct["log(No_Students)", "Pr(>|t|)"]
r2 <- s$r.squared

cap <- paste0(
  "The plot only represents rows with known TTO age and number of students (n = ", n_used, ").\n",
  "Linear regression (lm: TTO_size ~ log(No_Students)):\n",
  "Intercept = ", formatC(b0, digits = 3, format = "f"),
  " (SE ", formatC(se0, digits = 3, format = "f"), ", p = ", format.pval(p0, digits = 3), ")\n",
  "log(No_Students) coef = ", formatC(b1, digits = 3, format = "f"),
  " (SE ", formatC(se1, digits = 3, format = "f"), ", p = ", format.pval(p1, digits = 3), ")\n",
  "R² = ", formatC(r2, digits = 3, format = "f")
)

ggplot(Data_plot_filt, aes(x = log(No_Students), y = TTO_personnel, color = Continent)) +
  geom_point(alpha = 0.75) +
  geom_smooth(aes(group = 1), method = "lm", se = TRUE, linewidth = 1) +
  labs(
    x = "Number of Students (log)",
    y = "TTO Size (Personnel)",
    color = "Continent",
   # title = "Relationship between TTO size and the number of students",
    caption = cap
  ) +
  theme_minimal() +
  theme(
    axis.title   = element_text(size = 12, margin = margin(t = 5)),
    axis.text    = element_text(size = 12),

    # extra gap between x label and caption + 1.5 line spacing
    axis.title.x = element_text(margin = margin(t = 15)),
    plot.caption = element_text(size = 12, hjust = 0, lineheight = 1.5, margin = margin(t = 10)),

    plot.margin  = margin(10, 10, 10, 10),
    legend.title = element_blank(),
    legend.text  = element_text(size = 12, lineheight = 1.4),
    legend.key.size = grid::unit(0.8, "cm"),
    legend.spacing.y = grid::unit(0.3, "cm")
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

ggsave(
  filename = "TTO_Scatterplot_Size_Number_Students.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
rm(ct)
rm(fit)
rm(s)
rm(cap)
rm(b0)
rm(b1)
rm(n_used)
rm(se0)
rm(se1)
rm(p0)
rm(p1)
rm(r2)
# Excludes the outlier value of 155 (University of Cambridge); note that this
# filter also silently drops all rows with NA TTO_personnel
Data2 <- Data %>%
  filter(TTO_personnel != 155)

# Excludes Australia, for which no TTO size values are available
Data3 <- Data %>%
  filter(Continent != "Australia")

# Boxplot 
ggplot(Data3, aes(x = Continent, y = TTO_personnel)) +
  labs(x = "Continent", 
       y = "TTO Size (personnel)",
       caption = "Australia is excluded due to missing") +
      # title = "Boxplot_TTO_Size") 
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.5) +
  theme_minimal() +
  theme(
    axis.text  = element_text(size = 12),  
    axis.title = element_text(size = 12),
    plot.caption = element_text(size = 12, hjust = 0, lineheight = 1.5, margin = margin(t=10)) # more top margin for caption
)
## Warning: Removed 117 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 117 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(
  filename = "Boxplot_TTO_Size.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
## Warning: Removed 117 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Removed 117 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Comparison of means
sum_df <- Data2 %>%
  group_by(Continent) %>%
  summarise(
    mean_personnel = mean(TTO_personnel, na.rm = TRUE),
    sd_personnel   = sd(TTO_personnel, na.rm = TRUE),
    n             = sum(!is.na(TTO_personnel)),
    se_personnel   = sd_personnel / sqrt(n),
    .groups = "drop"
  )


ggplot(sum_df, aes(x = Continent, y = mean_personnel)) +
  geom_point() +
  labs(x = "Continent", 
       y = "Mean TTO personnel (± SE)",
       caption = "Two european outlier observations with values of 155 (University of Cambridge) are excluded") +
      # title = "Mean Analysis of TTO Size +
  geom_errorbar(aes(ymin = mean_personnel - se_personnel,
                    ymax = mean_personnel + se_personnel), width = 0.2) +
  theme_minimal() +
   theme(
    axis.text  = element_text(size = 12),  
    axis.title = element_text(size = 12),
    plot.caption = element_text(size = 12, hjust = 0, lineheight = 1.5, margin = margin(t=10)) # more top margin for caption
)

ggsave(
  filename = "Mean_Analysis_TTO_Size.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

# Intended to exclude Oceania; effectively a no-op, since "Oceania" was already
# relabeled "Australia" above and Data2 contains no Australian rows anyway
Data_plot <- Data2 %>%
  filter(Continent != "Oceania")

# Comprehensive table 
TTO_personnel_summary <- Data_plot %>%
  group_by(Continent) %>%
  summarise(
    n_total  = n(),
    n_na     = sum(is.na(TTO_personnel)),
    n_non_na = sum(!is.na(TTO_personnel)),
    min      = min(TTO_personnel, na.rm = TRUE),
    q25      = quantile(TTO_personnel, 0.25, na.rm = TRUE, type = 2),
    median   = median(TTO_personnel, na.rm = TRUE),
    mean     = round(mean(TTO_personnel, na.rm = TRUE),2),
    q75      = quantile(TTO_personnel, 0.75, na.rm = TRUE, type = 2),
    max      = max(TTO_personnel, na.rm = TRUE),
    .groups  = "drop"
  )

Total_data <- Data %>%
  summarise(
    Continent = "Total Data",
    n_total  = n(),
    n_na     = sum(is.na(TTO_personnel)),
    n_non_na = sum(!is.na(TTO_personnel)),
    min      = min(TTO_personnel, na.rm = TRUE),
    q25      = quantile(TTO_personnel, 0.25, na.rm = TRUE, type = 2),
    median   = median(TTO_personnel, na.rm = TRUE),
    mean     = round(mean(TTO_personnel, na.rm = TRUE),2),
    q75      = quantile(TTO_personnel, 0.75, na.rm = TRUE, type = 2),
    max      = max(TTO_personnel, na.rm = TRUE)
  )

TTO_personnel_summary <- bind_rows(TTO_personnel_summary, Total_data)
TTO_personnel_summary
## # A tibble: 5 × 10
##   Continent     n_total  n_na n_non_na   min   q25 median  mean   q75   max
##   <chr>           <int> <int>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Asia                5     0        5     6  10     11   10.2   11      13
## 2 Europe             16     0       16     3   6     11.5 14.1   18.5    49
## 3 North America      68     0       68     2   8     17   26.2   43.5    77
## 4 South Africa        4     0        4     6   6.5    9    8.75  11      11
## 5 Total Data        220   125       95     2   7     13   25.3   34     155
rm(Total_data)
rm(Data_plot)
rm(Data2)
rm(Data3)
# Boxplot
ggplot(Data, aes(x = Continent, y = TTO_age)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.15, alpha = 0.5) +
  theme_minimal() +
  labs(x = "Continent", y = "TTO Age")
## Warning: Removed 139 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 139 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(
  filename = "Boxplot_TTO_Age.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
## Warning: Removed 139 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Removed 139 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Comparison of means
sum_df <- Data %>%
  group_by(Continent) %>%
  summarise(
    mean_age = mean(TTO_age, na.rm = TRUE),
    sd_age   = sd(TTO_age, na.rm = TRUE),
    n        = sum(!is.na(TTO_age)),
    se_age   = sd_age / sqrt(n),   # standard error, not SD
    .groups = "drop"
  )

ggplot(sum_df, aes(x = Continent, y = mean_age)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean_age - se_age,
                    ymax = mean_age + se_age), width = 0.2) +
  theme_minimal() +
  labs(
       # title = "Average TTO Age",
       x = "Continent", 
       y = "Average TTO Age (± SE)")+
  theme(
    axis.text  = element_text(size = 12),  
    axis.title = element_text(size = 12),
    plot.caption = element_text(size = 12, hjust = 0, lineheight = 1.5, margin = margin(t=10)) # more top margin for caption
)

ggsave(
  filename = "Mean_Analysis_TTO_Age.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

# Comprehensive table
TTO_age_summary <- Data %>%
  group_by(Continent) %>%
  summarise(
    n_total  = n(),
    n_na     = sum(is.na(TTO_age)),
    n_non_na = sum(!is.na(TTO_age)),
    min      = min(TTO_age, na.rm = TRUE),
    q25      = quantile(TTO_age, 0.25, na.rm = TRUE, type = 2),
    median   = median(TTO_age, na.rm = TRUE),
    mean     = round(mean(TTO_age, na.rm = TRUE),2),
    q75      = quantile(TTO_age, 0.75, na.rm = TRUE, type = 2),
    max      = max(TTO_age, na.rm = TRUE),
    .groups  = "drop"
  )

Total_data <- Data %>%
  summarise(
    Continent = "Total Data",
    n_total  = n(),
    n_na     = sum(is.na(TTO_age)),
    n_non_na = sum(!is.na(TTO_age)),
    min      = min(TTO_age, na.rm = TRUE),
    q25      = quantile(TTO_age, 0.25, na.rm = TRUE, type = 2),
    median   = median(TTO_age, na.rm = TRUE),
    mean     = round(mean(TTO_age, na.rm = TRUE),2),
    q75      = quantile(TTO_age, 0.75, na.rm = TRUE, type = 2),
    max      = max(TTO_age, na.rm = TRUE)
  )

TTO_age_summary <- bind_rows(TTO_age_summary, Total_data)
TTO_age_summary
## # A tibble: 6 × 10
##   Continent     n_total  n_na n_non_na   min   q25 median  mean   q75   max
##   <chr>           <int> <int>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Asia               11     5        6     9     9   19.5  18.7    27    28
## 2 Australia           8     7        1    27    27   27    27      27    27
## 3 Europe             56    39       17     6    21   25    25.3    30    40
## 4 North America     139    88       51     5    22   29    30.8    40    61
## 5 South Africa        6     0        6    12    14   17.5  17.7    18    27
## 6 Total Data        220   139       81     5    20   27    27.7    37    61
rm(Total_data)
rm(sum_df)
# Builds a text cleaning function
library(stringi)
library(stopwords)  
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()
clean_rel_info <- function(text) {

  # Converts to lowercase
  text <- tolower(text)
  
  # Transliterates accented and other non-ASCII letters to plain ASCII
  text <- stri_trans_general(text, "Latin-ASCII")
  
  # Replaces commas, line breaks, and tabs with spaces
  text <- str_replace_all(text, "[\\n\\r\\t,]", " ")
  
  # Removes everything except lowercase letters and whitespace (digits, punctuation, symbols)
  text <- str_replace_all(text, "[^a-z\\s]", " ")
  
  # Removes single-letter words
  text <- str_replace_all(text, "\\b[a-z]\\b", " ")
  
  # Collapses repeated whitespace
  text <- str_replace_all(text, "\\s+", " ")
  
  # Trims leading/trailing spaces
  text <- str_trim(text)
  
  # Creates the English stopword list
  sw <- unique(c(stopwords("en")))
  
  # Splits text into words
  words <- unlist(str_split(text, "\\s+"))
  
  # Removes stopwords
  words <- words[!words %in% sw]
  
  # Recombines into cleaned text
  text <- paste(words, collapse = " ")
  
  return(text)
}
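
# Illustrative run (hypothetical input):
# clean_rel_info("The TTO's 3 main tasks: Licensing, Patents & Start-ups!")
# returns "tto main tasks licensing patents start ups"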

# Applies the text cleaning function to the data

Data <- Data %>%
  mutate(Rel_Info_clean = sapply(Rel_Info, clean_rel_info))

# Lemmatization: reduces words to their dictionary form
Data <- Data %>%
  mutate(Rel_Info_lemma = sapply(Rel_Info_clean, function(x) {
    paste(lemmatize_words(strsplit(x, "\\s+")[[1]]), collapse = " ")
  }))
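
# Illustrative lemmatization (exact lemmas depend on the lexicon::hash_lemmas dictionary):
# lemmatize_words(c("universities", "ran", "running"))
# is expected to return "university" "run" "run"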
write_xlsx(
  Data,
  path = file.path("/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Data", # For reproduction, specify your path
                   "Data combined.xlsx")
)
library(tidytext)
library(topicmodels)
library(broom)

# Adds a unique document identification number for a later merge
Data <- Data %>%
  mutate(doc_id = row_number())

# Tokenisation: the text is split into single word-based tokens for statistical analysis and for building the document-term matrix
Tokens <- Data %>%
  select(doc_id, Rel_Info_lemma) %>%
  unnest_tokens(word, Rel_Info_lemma, token = "words") %>%
  filter(!is.na(word), word != "")

# Removes stopwords again: lemmatization can map words onto stopword forms
sw <- unique(c(stopwords("en"))) 
Tokens <- Tokens %>%
  filter(!word %in% sw) %>%
  filter(nchar(word) >= 2)

# Counts how often each word occurs in each document: 36,862 document-term pairs
# from 65,013 tokens (derived from the per-document averages reported below)
Doc_term_counts <- Tokens %>%
  count(doc_id, word, sort = FALSE)

# Average number of tokens per document 
Av_words_doc <- nrow(Tokens)/ nrow(Data)
Av_words_doc
## [1] 295.5136
# This commented-out code would set a lower bound on word occurrence (document frequency)
# doc_freq <- Doc_term_counts %>% group_by(word) %>% summarise(df = n_distinct(doc_id))
# keep_words <- doc_freq %>% filter(df >= 2) %>% pull(word)
# Doc_term_counts <- Doc_term_counts %>% filter(word %in% keep_words)

# Document-term-matrix
Dtm <- Doc_term_counts %>%
  cast_dtm(document = doc_id, term = word, value = n)
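
# Quick structural check (illustrative): dim(Dtm) returns
# c(number of documents, vocabulary size) -- 220 documents here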

# Average number of unique words per document (counts each word once per document)
Av_unique_words_doc <- nrow(Doc_term_counts)/ nrow(Data)
Av_unique_words_doc
## [1] 167.5545
rm(sw)

# if (nrow(Dtm) == 0 || ncol(Dtm) == 0) {
# stop("The DTM is empty. Check your cleaning/stopword steps or the input data.")}
# Count how many documents each word appears in
Top_30_words <- Tokens %>%
  group_by(word) %>%
  summarise(doc_count = n_distinct(doc_id)) %>%
  ungroup() %>%
  mutate(
    total_docs = n_distinct(Tokens$doc_id),
    share = doc_count / total_docs
  ) %>%
  arrange(desc(doc_count)) %>%
  slice_head(n = 30)

Top_30_words
## # A tibble: 30 × 4
##    word        doc_count total_docs share
##    <chr>           <int>      <int> <dbl>
##  1 university        189        220 0.859
##  2 technology        188        220 0.855
##  3 experience        181        220 0.823
##  4 work              177        220 0.805
##  5 development       174        220 0.791
##  6 research          173        220 0.786
##  7 transfer          173        220 0.786
##  8 include           166        220 0.755
##  9 management        166        220 0.755
## 10 manage            165        220 0.75 
## # ℹ 20 more rows
rm(Top_30_words)
library(topicmodels)
library(textmineR)
## Loading required package: Matrix
## 
## Attaching package: 'textmineR'
## The following object is masked from 'package:Matrix':
## 
##     update
## The following object is masked from 'package:topicmodels':
## 
##     posterior
## The following object is masked from 'package:stats':
## 
##     update
library(purrr)
library(dplyr)
library(tibble)
library(Matrix)
library(future)
library(furrr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## Attaching package: 'tm'
## The following object is masked from 'package:koRpus':
## 
##     readTagged
## The following object is masked from 'package:stopwords':
## 
##     stopwords
library(httr)
## 
## Attaching package: 'httr'
## The following object is masked from 'package:NLP':
## 
##     content
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
set.seed(123)

# Converts the tm DocumentTermMatrix into a sparse Matrix
to_sparse <- function(dtm_tm) {
  Matrix::sparseMatrix(
    i = dtm_tm$i,              # Row indices (documents) of non-zero entries
    j = dtm_tm$j,              # Column indices (terms) of non-zero entries
    x = dtm_tm$v,              # Values (term counts)
    dims = c(dtm_tm$nrow, dtm_tm$ncol),
    dimnames = list(
      Docs(dtm_tm),            # Document names
      Terms(dtm_tm)            # Term names
    )
  )
}
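
# Illustrative usage (the tuning function below performs this conversion itself):
# dtm_sparse <- to_sparse(Dtm)
# dtm_sparse <- methods::as(dtm_sparse, "dgCMatrix")  # format textmineR expects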

# Computes the mean topic coherence of one model on a given set of documents
coherence_for_model <- function(lda_model, dtm_sparse, M) { 
  # Topic-term matrix P(term | topic): the probability of each word given a topic
  phi <- topicmodels::posterior(lda_model)$terms
  
  # Overlap of terms in model and in the (sub)set of documents, 
  common <- intersect(colnames(phi), colnames(dtm_sparse))
  if (length(common) < 5) return(NA_real_)  # Skips folds with fewer than 5 overlapping terms (10 possible pairs); no meaningful coherence calculation otherwise (Röder et al., 2015)
  
  phi_use <- phi[, common, drop = FALSE]
  dtm_use <- dtm_sparse[, common, drop = FALSE]
  
  # Calculates coherence value for every topic and a mean across all topics
  mean(
    textmineR::CalcProbCoherence(phi = phi_use, dtm = dtm_use, M = M), # M = number of top-ranked words used for coherence calculation; otherwise the calculation would be too noisy and too time-consuming (Röder et al., 2015; Mimno et al., 2011)
    na.rm = TRUE
  )
}
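
# Illustrative call (hypothetical "lda_fit", on the full sparse DTM):
# coherence_for_model(lda_fit, methods::as(to_sparse(Dtm), "dgCMatrix"), M = 10)
# CalcProbCoherence averages P(w_j | w_i) - P(w_j) over the top-M word pairs of a
# topic, so positive values mean the top words co-occur more often than by chance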

# Main function: tune k with 5-fold CV, then tune M 
tune_k_then_M <- function(dtm_tm,
                          k_values = 2:15,
                          M_for_k = 10,
                          folds_k = 5,
                          seed = 123,
                          M_values = 5:20) {

  set.seed(seed)

  # convert once: used for all coherence computations
  dtm_sparse <- to_sparse(dtm_tm)

  # important for textmineR (prevents errors such as "p not found")
  dtm_sparse <- methods::as(dtm_sparse, "dgCMatrix")

  # Build folds over documents
  n_docs  <- nrow(dtm_sparse)
  fold_id <- sample(rep(1:folds_k, length.out = n_docs))

  # Parallel plan (optionally remember the previous plan and restore it on exit)
  old_plan <- future::plan()
  on.exit(future::plan(old_plan), add = TRUE)
  future::plan(future::multisession, workers = max(1, parallel::detectCores() - 1))

  # Tune k
  res_k <- furrr::future_map_dfr(
    k_values,
    function(k) {

      ctrl <- list(
        seed = seed + k,
        alpha = 50 / k,
        estimate.alpha = TRUE
      )

      lda_k <- topicmodels::LDA(dtm_tm, k = k, method = "VEM", control = ctrl)

      scores <- purrr::map_dbl(1:folds_k, function(f) {
        idx <- which(fold_id == f)
        if (length(idx) < 5) return(NA_real_)

        coherence_for_model(
          lda_k,
          dtm_sparse[idx, , drop = FALSE],
          M = M_for_k
        )
      })

      tibble::tibble(
        k          = k,
        mean_coh   = mean(scores, na.rm = TRUE),
        sd_coh     = sd(scores,   na.rm = TRUE),
        folds_used = sum(is.finite(scores))
      )
    },
    .options = furrr::furrr_options(seed = seed)
  )

  # Choose best k (smaller k wins ties)
  best_k <- res_k |>
    dplyr::filter(is.finite(mean_coh)) |>
    dplyr::arrange(dplyr::desc(mean_coh), k) |>
    dplyr::slice(1) |>
    dplyr::pull(k)

  # Fit final model with best_k
  best_ctrl <- list(
    seed  = seed + best_k,
    alpha = 50 / best_k
  )

  best_model <- topicmodels::LDA(
    dtm_tm,
    k = best_k,
    method = "VEM",
    control = best_ctrl
  )

  # Tune M for chosen model
  res_M <- tibble::tibble(
    M = M_values,
    mean_coh = purrr::map_dbl(
      M_values,
      ~ coherence_for_model(best_model, dtm_sparse, M = .x)
    )
  ) |>
    dplyr::arrange(dplyr::desc(mean_coh), M)

  best_M <- res_M$M[1]

  list(
    best_k     = best_k,
    best_M     = best_M,
    k_results  = res_k,
    M_results  = res_M,
    best_model = best_model
  )
}
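# Optional dry run on a small grid before the full sweep (assumes `Dtm` exists);
# useful to confirm the parallel plan and coherence computation work end to end:
# out_test <- tune_k_then_M(Dtm, k_values = 2:4, M_for_k = 5, folds_k = 3, M_values = 5:8)
# out_test$best_k; head(out_test$k_results)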
  

# Applies function, calculation of optimal k and M
set.seed(123)
Out <- tune_k_then_M(
  dtm_tm   = Dtm,
  k_values = 2:15,
  M_for_k  = 10,
  folds_k  = 5,
  M_values = 5:20
)

opt_k <- Out$best_k
opt_M <- Out$best_M
head(Out$k_results)
## # A tibble: 6 × 4
##       k mean_coh  sd_coh folds_used
##   <int>    <dbl>   <dbl>      <int>
## 1     2   0.0469 0.0103           5
## 2     3   0.0433 0.00857          5
## 3     4   0.0407 0.00823          5
## 4     5   0.0453 0.00968          5
## 5     6   0.0467 0.00971          5
## 6     7   0.0461 0.00726          5
head(Out$M_results)
## # A tibble: 6 × 2
##       M mean_coh
##   <int>    <dbl>
## 1     6   0.0599
## 2     8   0.0590
## 3     7   0.0580
## 4     9   0.0570
## 5    10   0.0567
## 6     5   0.0557
opt_k
## [1] 11
opt_M
## [1] 6
# Final topic model with the optimal number of topics (opt_k = 11); opt_M (= 6) top words per topic are used below
Lda_model_opt_k_M <- LDA(Dtm, k = opt_k, control = list(seed = 123))
set.seed(123)

gamma_mat <- topicmodels::posterior(Lda_model_opt_k_M)$topics
dim(gamma_mat)   # should be number of documents x opt_k
## [1] 220  11
doc_ids <- Docs(Dtm)  

DocTopics_long <- tidy(Lda_model_opt_k_M, matrix = "gamma")

DocTopics_long <- DocTopics_long %>%
  mutate(
    doc_index = as.integer(document),    
    document  = doc_ids[doc_index]       
  ) %>%
  select(document, topic, gamma)

DocTopics_wide <- as_tibble(gamma_mat) %>%
  mutate(document = doc_ids) %>%
  relocate(document) %>%
  rename_with(
    ~ paste0("topic_", seq_along(.)),   
    .cols = -document
  )

# Check rowsum, has to be 1
DocTopics_wide <- DocTopics_wide %>%
  mutate(rowsum = rowSums(dplyr::select(., starts_with("topic_"))))

summary(DocTopics_wide$rowsum)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
head(DocTopics_wide)
## # A tibble: 6 × 13
##   document  topic_1   topic_2   topic_3  topic_4 topic_5 topic_6 topic_7 topic_8
##   <chr>       <dbl>     <dbl>     <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 1        0.000997 0.000997  0.000997  0.000997 9.97e-4 9.97e-4 9.97e-4 9.97e-4
## 2 2        0.000297 0.000297  0.000297  0.000297 2.97e-4 2.97e-4 2.97e-4 2.97e-4
## 3 3        0.000108 0.000108  0.211     0.000108 3.63e-1 1.08e-4 1.08e-4 1.08e-4
## 4 4        0.885    0.0000680 0.0000680 0.114    6.80e-5 6.80e-5 6.80e-5 6.80e-5
## 5 5        0.000181 0.000181  0.0536    0.000181 1.37e-1 1.81e-4 1.81e-4 1.81e-4
## 6 6        0.000128 0.000128  0.000128  0.999    1.28e-4 1.28e-4 1.28e-4 1.28e-4
## # ℹ 4 more variables: topic_9 <dbl>, topic_10 <dbl>, topic_11 <dbl>,
## #   rowsum <dbl>
topic_share_corpus <- DocTopics_wide %>%
  select(starts_with("topic_")) %>%
  summarise(across(everything(), mean))


topic_mean_long <- topic_share_corpus %>%
  pivot_longer(
    everything(),
    names_to  = "topic",
    values_to = "mean_prop"
  ) %>%
  mutate(
    topic_num   = as.integer(sub("topic_", "", topic)),  
    mean_percent = mean_prop * 100                       
  ) %>%
  arrange(topic_num)

ggplot(topic_mean_long,
       aes(x = factor(topic_num), y = mean_percent)) +
  geom_col() +
  labs(
    title = "Average topic distribution across documents for Lda_model_opt_k_M",
    x     = "Topic",
    y     = "Share (%) (P(Ttopic | Document))"
  )

ggsave(
  filename = "Average_topic_distribution_across_documents_Lda_model_opt_k_M.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

rm(topic_mean_long)
rm(topic_share_corpus)
pdf(file.path("/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", "Topic_distribution_per_document.pdf"), width = 8, height = 5)

for (doc_id in doc_ids) {
  
  doc_topics <- DocTopics_long %>%
    filter(document == doc_id) %>%
    mutate(
      topic_num = as.integer(gsub("[^0-9]", "", as.character(topic))),
      topic_num = factor(topic_num, levels = seq_len(opt_k))
    )
  
p <- ggplot(doc_topics, aes(x = topic_num, y = gamma * 100)) +
  geom_col() +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = paste("Topic distribution per document for Lda_model_opt_k_M", doc_id),
    x     = "Topic",
    y     = "Share (%) (P(Topic | Document))"
    ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p)

}

dev.off() 
## quartz_off_screen 
##                 2
rm(p)
rm(doc_topics)
rm(doc_ids)
# Result of topic modeling with opt_k (11) topics and opt_M (6) top words per topic (beta = P(term|topic))
Topics <- tidy(Lda_model_opt_k_M, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(beta, n = opt_M, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

Topics_opt_k_M_wide <- Topics %>%
  group_by(topic) %>%
  mutate(rank = row_number()) %>%
  select(topic, rank, term) %>%
  pivot_wider(
    names_from = rank,
    values_from = term,
    names_prefix = "term_"
  ) %>%
  ungroup()

# Result export as PDF
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)

topics_path <- "/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Topics"

pdf(file = file.path(topics_path, "Topics_opt_k_M_wide.pdf"), width = 11.7, height = 8.3)
grid.table(Topics_opt_k_M_wide)
grid.text(
  label = "LDA with optimal number of topics (10) and topwords (11)",
  x = 0.5, y = 0.95,
  gp = gpar(fontsize = 16, fontface = "bold")
)

dev.off()
## quartz_off_screen 
##                 2
set.seed(123)

dtm_sparse <- Matrix::sparseMatrix(
  i = Dtm$i, j = Dtm$j, x = Dtm$v,
  dims = c(Dtm$nrow, Dtm$ncol),
  dimnames = list(Docs(Dtm), Terms(Dtm))
)

# Topic-term matrix phi: P(term | topic), probability of a word given a topic
phi <- topicmodels::posterior(Lda_model_opt_k_M)$terms

# Same order of terms
common_terms <- intersect(colnames(phi), colnames(dtm_sparse))
stopifnot(length(common_terms) >= 6) # Min 6 overlapping terms for meaningful coherence score calculation
phi <- phi[, common_terms, drop = FALSE]
dtm_sparse <- dtm_sparse[, common_terms, drop = FALSE]

# Calculates coherence for the optimal number of top words per topic (M = opt_M)
coherence_vec <- textmineR::CalcProbCoherence(phi = phi, dtm = dtm_sparse, M = opt_M)

# Mean across all topics
mean_coherence_opt_k_n <- mean(coherence_vec, na.rm = TRUE)
coherence_vec   # Coherence per topic
##          1          2          3          4          5          6          7 
## 0.03772073 0.11932845 0.06777237 0.03505229 0.10571962 0.01129753 0.04182824 
##          8          9         10         11 
## 0.04473407 0.05224415 0.06467550 0.06220013
mean_coherence_opt_k_n  # Average coherence score
## [1] 0.05841573
k_results <- Out$k_results

ggplot(k_results, aes(x = k, y = mean_coh)) +
  geom_line() +
  geom_point() +
  geom_errorbar(
    aes(ymin = mean_coh - sd_coh,
        ymax = mean_coh + sd_coh),
    width = 0.2
  ) +
  labs(
    x = "Number of topics (k)",
    y = "Mean probabilistic coherence",
    title = "Average probabilistic cohrence_lda_model_opt_k_M"
  ) +
  theme_minimal()

ggsave(
  filename = "Average_probabilistic_cohrence_lda_model_opt_k_M.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
set.seed(123)

# Defines Palmetto function
get_palmetto_cv <- function(topic_terms, coherence = "cv") {
  base_url <- "http://palmetto.aksw.org/palmetto-webapp/service"
  words_param <- utils::URLencode(paste(topic_terms, collapse = " "), reserved = TRUE)
  url <- paste0(base_url, "/", coherence, "?words=", words_param)
  resp <- httr::GET(url)
  httr::stop_for_status(resp)
  
  resp_text <- httr::content(resp, "text", encoding = "UTF-8")
  as.numeric(resp_text)
}
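# Example call (requires internet access; the public Palmetto endpoint can be slow
# or temporarily unavailable, so treat this as an optional check):
# get_palmetto_cv(c("university", "research", "technology"))  # returns one numeric c_v score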

cv_for_model <- function(lda_model, M = opt_M, coherence = "cv") {
 
   # Topic-term matrix
  phi <- topicmodels::posterior(lda_model)$terms
  
  # Top M terms per topic (rows = terms, cols = topics)
  top_terms <- apply(phi, 1, function(p) {
    names(sort(p, decreasing = TRUE)[1:M])
  })
  
  # If there is only 1 topic, ensure top_terms is a matrix
  if (is.null(dim(top_terms))) {
    top_terms <- matrix(top_terms, ncol = 1)
  }
  
  K <- ncol(top_terms)
  topic_cv <- numeric(K)
  
  for (k in seq_len(K)) {
    topic_cv[k] <- get_palmetto_cv(top_terms[, k], coherence = coherence)
  }
  
  list(
    topic_cv = topic_cv,
    mean_cv  = mean(topic_cv, na.rm = TRUE)
  )
}

M <- ifelse(opt_M <= 10, opt_M, 10) # The Palmetto service accepts at most 10 top words
cv_opt_k_M <- cv_for_model(Lda_model_opt_k_M, M = M, coherence = "cv")

cv_opt_k_M_per_topic <- cv_opt_k_M$topic_cv  
cv_opt_k_M_per_topic
##  [1] 0.6310625 0.6317924 0.5273775 0.6412863 0.6146113 0.6034484 0.6570182
##  [8] 0.5342637 0.5880477 0.7477447 0.5180718
cv_av_opt_k_M <- cv_opt_k_M$mean_cv    
cv_av_opt_k_M
## [1] 0.6086113
set.seed(123)

library(dplyr)
library(tidytext)
library(stringr)

# Remove terms appearing in >80% of docs (treat as domain stopwords)
doc_total <- n_distinct(Tokens$doc_id)
hi_freq <- Tokens %>%
  group_by(word) %>%
  summarise(df = n_distinct(doc_id), .groups = "drop") %>%
  filter(df > 0.80 * doc_total) %>%
  pull(word)

# Removes "tlo" (technology licence officer), the word disturbs the topic model
domain_stops <- unique(c(hi_freq, "tlo"))

Tokens_filt <- Tokens %>%
  filter(!str_to_lower(word) %in% domain_stops)
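# Sanity check: terms from the Top-30 table above with a document share > 0.80
# (e.g. "university", "technology", "experience", "work") and "tlo" should be gone:
# any(c("university", "technology", "tlo") %in% str_to_lower(Tokens_filt$word))  # FALSE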

# Build DTM and fit LDA (VEM by default)
Dtm_filt <- Tokens_filt %>%
  count(doc_id, word) %>%
  cast_dtm(doc_id, word, n)

# Remaining average word count per document
Av_words_doc_filt <- nrow(Tokens_filt) / nrow(Data)
Av_words_doc_filt
## [1] 281.0545
Doc_term_counts_filt <- Tokens_filt %>%
  count(doc_id, word, sort = FALSE)

Av_unique_words_doc_filt <- nrow(Doc_term_counts_filt) / nrow(Data)
Av_unique_words_doc_filt
## [1] 164.2
rm(hi_freq)
# Applies function, calculation of optimal k and M
set.seed(123)

Out <- tune_k_then_M(
  dtm_tm   = Dtm_filt,
  k_values = 2:15,
  M_for_k  = 10,
  folds_k  = 5,
  M_values = 5:20
)

opt_k_f <- Out$best_k
opt_M_f <- Out$best_M
head(Out$k_results)
## # A tibble: 6 × 4
##       k mean_coh  sd_coh folds_used
##   <int>    <dbl>   <dbl>      <int>
## 1     2   0.0709 0.0129           5
## 2     3   0.0689 0.00598          5
## 3     4   0.0747 0.00788          5
## 4     5   0.0681 0.00770          5
## 5     6   0.0626 0.00562          5
## 6     7   0.0739 0.00703          5
head(Out$M_results)
## # A tibble: 6 × 2
##       M mean_coh
##   <int>    <dbl>
## 1     7   0.0800
## 2     9   0.0797
## 3    10   0.0782
## 4    11   0.0779
## 5     6   0.0776
## 6     8   0.0776
opt_k_f
## [1] 14
opt_M_f
## [1] 7
rm(Doc_term_counts_filt)
# Topic model on the filtered DTM with the optimal number of topics (opt_k_f = 14); opt_M_f (= 7) top words per topic are used below
Lda_model_f_opt_k_M <- LDA(Dtm_filt, k = opt_k_f, control = list(seed = 123))
set.seed(123)

gamma_mat <- topicmodels::posterior(Lda_model_f_opt_k_M)$topics
dim(gamma_mat)   # should be number of documents x opt_k_f
## [1] 220  14
library(tibble)

doc_ids <- Docs(Dtm_filt)  

DocTopics_long <- tidy(Lda_model_f_opt_k_M, matrix = "gamma")

DocTopics_long <- DocTopics_long %>%
  mutate(
    doc_index = as.integer(document),    
    document  = doc_ids[doc_index]       
  ) %>%
  select(document, topic, gamma)

DocTopics_wide <- as_tibble(gamma_mat) %>%
  mutate(document = doc_ids) %>%
  relocate(document) %>%
  rename_with(
    ~ paste0("topic_", seq_along(.)),   
    .cols = -document
  )

# Check rowsum, has to be 1
DocTopics_wide <- DocTopics_wide %>%
  mutate(rowsum = rowSums(dplyr::select(., starts_with("topic_"))))

summary(DocTopics_wide$rowsum)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
head(DocTopics_wide)
## # A tibble: 6 × 16
##   document   topic_1   topic_2   topic_3 topic_4 topic_5 topic_6 topic_7 topic_8
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 1        0.000763  0.000763  0.000763  7.63e-4 7.63e-4 7.63e-4 7.63e-4 7.63e-4
## 2 2        0.000227  0.000227  0.000227  2.27e-4 2.27e-4 2.27e-4 2.27e-4 2.27e-4
## 3 3        0.0000824 0.0000824 0.269     8.24e-5 1.05e-1 8.24e-5 8.24e-5 8.24e-5
## 4 4        0.999     0.0000520 0.0000520 5.20e-5 5.20e-5 5.20e-5 5.20e-5 5.20e-5
## 5 5        0.000139  0.000139  0.000139  1.39e-4 1.39e-4 6.34e-2 1.39e-4 1.39e-4
## 6 6        0.0000980 0.0000980 0.0000980 9.99e-1 9.80e-5 9.80e-5 9.80e-5 9.80e-5
## # ℹ 7 more variables: topic_9 <dbl>, topic_10 <dbl>, topic_11 <dbl>,
## #   topic_12 <dbl>, topic_13 <dbl>, topic_14 <dbl>, rowsum <dbl>
topic_share_corpus <- DocTopics_wide %>%
  select(starts_with("topic_")) %>%
  summarise(across(everything(), mean))


topic_mean_long <- topic_share_corpus %>%
  pivot_longer(
    everything(),
    names_to  = "topic",
    values_to = "mean_prop"
  ) %>%
  mutate(
    topic_num   = as.integer(sub("topic_", "", topic)),  
    mean_percent = mean_prop * 100                       
  ) %>%
  arrange(topic_num)

ggplot(topic_mean_long,
       aes(x = factor(topic_num), y = mean_percent)) +
  geom_col() +
  labs(
    title = "Average topic distribution across documents n\for Lda_model_f_opt_k_M",
    x     = "Topic",
    y     = "Share (%) (P(Ttopic | Document))"
  )

ggsave(
  filename = "Average_topic_distribution_across_documents_Lda_model_f_opt_k_M.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

rm(topic_mean_long)
rm(topic_share_corpus)
pdf(file.path("/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", "Topic_distribution_per_document_filtered_Dtm.pdf"), width = 8, height = 5) # For reproduction, specify your path

for (doc_id in doc_ids) {
  
  doc_topics <- DocTopics_long %>%
    filter(document == doc_id) %>%
    mutate(
      topic_num = as.integer(gsub("[^0-9]", "", as.character(topic))),
      topic_num = factor(topic_num, levels = seq_len(opt_k_f))
    )
  
p <- ggplot(doc_topics, aes(x = topic_num, y = gamma * 100)) +
  geom_col() +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = paste("Topic distribution per document for Lda_model_f_opt_k_M", doc_id),
    x     = "Topic",
    y     = "Share (%) (P(Topic | Document))"
    ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p)

}

dev.off() 
## quartz_off_screen 
##                 2
rm(p)
rm(doc_topics)
rm(doc_ids)
# Result of topic modeling with optimal number of topics and words per topic, (beta = P(term|topic))
Topics <- tidy(Lda_model_f_opt_k_M, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(beta, n = opt_M_f, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

Topics_f_opt_k_M_wide <- Topics %>%
  group_by(topic) %>%
  mutate(rank = row_number()) %>%
  select(topic, rank, term) %>%
  pivot_wider(
    names_from = rank,
    values_from = term,
    names_prefix = "term_"
  ) %>%
  ungroup()

pdf(file = file.path(topics_path, "Topics_opt_k_M_f_wide.pdf"), width = 11.7, height = 8.3)
grid.table(Topics_f_opt_k_M_wide)
grid.text(
  label = "LDA based on filtered Dtm with coherence score maximizing k (9) and M (11) ",
  x = 0.5, y = 0.95,
  gp = gpar(fontsize = 16, fontface = "bold")
)

dev.off()
## quartz_off_screen 
##                 2
set.seed(123)

dtm_sparse <- Matrix::sparseMatrix(
  i = Dtm_filt$i, j = Dtm_filt$j, x = Dtm_filt$v,
  dims = c(Dtm_filt$nrow, Dtm_filt$ncol),
  dimnames = list(Docs(Dtm_filt), Terms(Dtm_filt))
)

# Topic-term matrix phi: P(term | topic), probability of a word given a topic
phi <- topicmodels::posterior(Lda_model_f_opt_k_M)$terms

# Same order of terms
common_terms <- intersect(colnames(phi), colnames(dtm_sparse))
stopifnot(length(common_terms) >= 6) # Min 6 overlapping terms for meaningful coherence score calculation
phi <- phi[, common_terms, drop = FALSE]
dtm_sparse <- dtm_sparse[, common_terms, drop = FALSE]

# Calculates coherence for the optimal number of top words per topic (M = opt_M_f)
coherence_vec <- textmineR::CalcProbCoherence(phi = phi, dtm = dtm_sparse, M = opt_M_f)

# Mean across all topics
mean_coherence_f_opt_k_M <- mean(coherence_vec, na.rm = TRUE)
coherence_vec   # Coherence per topic
##          1          2          3          4          5          6          7 
## 0.06755852 0.07654874 0.08549464 0.05248502 0.13993629 0.03402538 0.07458026 
##          8          9         10         11         12         13         14 
## 0.03578427 0.02979074 0.06289999 0.07940357 0.08203238 0.03444631 0.14182498
mean_coherence_f_opt_k_M  # Average coherence score
## [1] 0.07120079
k_results <- Out$k_results

ggplot(k_results, aes(x = k, y = mean_coh)) +
  geom_line() +
  geom_point() +
  geom_errorbar(
    aes(ymin = mean_coh - sd_coh,
        ymax = mean_coh + sd_coh),
    width = 0.2
  ) +
  labs(
    x = "Number of topics (k)",
    y = "Mean probabilistic coherence",
    title = "Average probabilistic cohrence lda_model_f_opt_k_M.jpg"
  ) +
  theme_minimal()

ggsave(
  filename = "Average_probabilistic_cohrence_lda_model_f_opt_k_M.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
M <- ifelse(opt_M_f <= 10, opt_M_f, 10) # The Palmetto service accepts at most 10 top words

cv_f_opt_k_M <- cv_for_model(Lda_model_f_opt_k_M, M = M, coherence = "cv")

cv_f_opt_k_M_per_topic <- cv_f_opt_k_M$topic_cv   # c_v per topic
cv_f_opt_k_M_per_topic
##  [1] 0.6068744 0.5449560 0.5272105 0.5587452 0.5392415 0.4577723 0.5047650
##  [8] 0.5340820 0.5621948 0.6689099 0.4297619 0.4770009 0.5227212 0.5359144
cv_av_f_opt_k_M <- cv_f_opt_k_M$mean_cv # mean c_v across topics
cv_av_f_opt_k_M
## [1] 0.5335821
set.seed(123)

# Word counts per document-term (raw counts)
Doc_term_counts_tf <- Tokens_filt %>%
  count(doc_id, word, name = "n")

# Compute tf-idf per document-term
Doc_term_tf <- Doc_term_counts_tf %>%
  bind_tf_idf(term = word, document = doc_id, n = n)

# Aggregate tf-idf per term to decide which words to keep
word_stats_tf <- Doc_term_tf %>%
  group_by(word) %>%
  summarise(
    mean_tf = mean(tf_idf),
    max_tf  = max(tf_idf),
    .groups = "drop"
  )

# Choose a threshold (example: drop bottom 10% by mean tf-idf). Adjustable (e.g. 0.05)
threshold_tf <- quantile(word_stats_tf$mean_tf, 
                         probs = 0.10, na.rm = TRUE)

vocab_tf <- word_stats_tf %>%
  filter(mean_tf >= threshold_tf) %>%
  pull(word)
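# Spot check: how many terms the 10%-quantile cutoff removes from the vocabulary
# nrow(word_stats_tf) - length(vocab_tf)  # number of dropped low-tf-idf terms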

Doc_term_counts_filt_tf <- Doc_term_counts_tf %>%
  filter(word %in% vocab_tf)

# DTM over the tf-idf-filtered vocabulary (raw counts as values) used for the topic model
Dtm_filt_tf <- Doc_term_counts_filt_tf %>%
  cast_dtm(document = doc_id,
           term     = word,
           value    = n)

# Tuning of k and M 
set.seed(123)

Out <- tune_k_then_M(
  dtm_tm   = Dtm_filt_tf,
  k_values = 2:15,
  M_for_k  = 10,
  folds_k  = 5,
  M_values = 5:20
)

opt_k_f_tf <- Out$best_k
opt_M_f_tf <- Out$best_M
head(Out$k_results)
## # A tibble: 6 × 4
##       k mean_coh  sd_coh folds_used
##   <int>    <dbl>   <dbl>      <int>
## 1     2   0.0695 0.0130           5
## 2     3   0.0662 0.00591          5
## 3     4   0.0857 0.00713          5
## 4     5   0.0853 0.0151           5
## 5     6   0.0849 0.0143           5
## 6     7   0.104  0.0169           5
head(Out$M_results)
## # A tibble: 6 × 2
##       M mean_coh
##   <int>    <dbl>
## 1     6    0.138
## 2     5    0.131
## 3     7    0.127
## 4     9    0.127
## 5    10    0.127
## 6     8    0.127
opt_k_f_tf
## [1] 15
opt_M_f_tf
## [1] 6
rm(Doc_term_counts_filt_tf)
rm(threshold_tf)
rm(word_stats_tf)
Lda_model_f_tf_opt_k_M <- LDA(Dtm_filt_tf, k = opt_k_f_tf, control = list(seed = 123))
set.seed(123)

gamma_mat <- topicmodels::posterior(Lda_model_f_tf_opt_k_M)$topics
dim(gamma_mat)   # should be number of documents x opt_k_f_tf
## [1] 220  15
library(tibble)

doc_ids <- Docs(Dtm_filt_tf)  

DocTopics_long <- tidy(Lda_model_f_tf_opt_k_M, matrix = "gamma")

DocTopics_long <- DocTopics_long %>%
  mutate(
    doc_index = as.integer(document),    
    document  = doc_ids[doc_index]       
  ) %>%
  select(document, topic, gamma)

DocTopics_wide <- as_tibble(gamma_mat) %>%
  mutate(document = doc_ids) %>%
  relocate(document) %>%
  rename_with(
    ~ paste0("topic_", seq_along(.)),   
    .cols = -document
  )

# Check rowsum, has to be 1
DocTopics_wide <- DocTopics_wide %>%
  mutate(rowsum = rowSums(dplyr::select(., starts_with("topic_"))))

DocTopics_wide_tf <- DocTopics_wide

summary(DocTopics_wide$rowsum)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
head(DocTopics_wide)
## # A tibble: 6 × 17
##   document   topic_1   topic_2   topic_3 topic_4 topic_5 topic_6 topic_7 topic_8
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 1        0.00140   0.00140   0.00140   4.93e-1 1.40e-3 1.40e-3 1.40e-3 1.40e-3
## 2 2        0.000449  0.000449  0.336     2.50e-1 4.49e-4 4.49e-4 4.49e-4 4.49e-4
## 3 3        0.000126  0.000126  0.000126  1.26e-4 1.26e-4 1.26e-4 1.26e-4 1.26e-4
## 4 4        0.0000837 0.0000837 0.0000837 9.99e-1 8.37e-5 8.37e-5 8.37e-5 8.37e-5
## 5 5        0.000300  0.000300  0.000300  3.00e-4 3.00e-4 3.00e-4 7.65e-1 3.00e-4
## 6 6        0.000132  0.998     0.000132  1.32e-4 1.32e-4 1.32e-4 1.32e-4 1.32e-4
## # ℹ 8 more variables: topic_9 <dbl>, topic_10 <dbl>, topic_11 <dbl>,
## #   topic_12 <dbl>, topic_13 <dbl>, topic_14 <dbl>, topic_15 <dbl>,
## #   rowsum <dbl>
topic_share_corpus <- DocTopics_wide %>%
  select(starts_with("topic_")) %>%
  summarise(across(everything(), mean))


topic_mean_long <- topic_share_corpus %>%
  pivot_longer(
    everything(),
    names_to  = "topic",
    values_to = "mean_prop"
  ) %>%
  mutate(
    topic_num   = as.integer(sub("topic_", "", topic)),  
    mean_percent = mean_prop * 100                       
  ) %>%
  arrange(topic_num)

ggplot(topic_mean_long,
       aes(x = factor(topic_num), y = mean_percent)) +
  geom_col() +
  labs(
    title = "Average topic distribution across documents \nfor Lda_model_f_tf_opt_k_M",
    x     = "Topic",
    y     = "Share (%) (P(Ttopic | Document))"
  )

ggsave(
  filename = "Average_topic_distribution_across_documents_Lda_model_f_tf_opt_k_M.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

rm(topic_mean_long)
rm(topic_share_corpus)
pdf(file.path("/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", "Topic_distribution_per_document_filtered_tf_Dtm.pdf"), width = 8, height = 5) # For reproduction, specify your path

for (doc_id in doc_ids) {
  
  doc_topics <- DocTopics_long %>%
    filter(document == doc_id) %>%
    mutate(
      topic_num = as.integer(gsub("[^0-9]", "", as.character(topic))),
      topic_num = factor(topic_num, levels = seq_len(opt_k_f_tf))
    )
  
p <- ggplot(doc_topics, aes(x = topic_num, y = gamma * 100)) +
  geom_col() +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = paste("Topic distribution per document for Lda_model_f_tf_opt_k_M", doc_id),
    x     = "Topic",
    y     = "Share (%) (P(Topic | Document))"
    ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p)

}

dev.off() 
## quartz_off_screen 
##                 2
rm(p)
rm(doc_topics)
rm(doc_ids)

# Result of topic modeling with optimal number of topics and words per topic, (beta = P(term|topic))
Topics <- tidy(Lda_model_f_tf_opt_k_M, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(beta, n = opt_M_f_tf, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

Topics_f_tf_opt_k_M_wide <- Topics %>%
  group_by(topic) %>%
  mutate(rank = row_number()) %>%
  select(topic, rank, term) %>%
  pivot_wider(
    names_from = rank,
    values_from = term,
    names_prefix = "term_"
  ) %>%
  ungroup()

pdf(file = file.path(topics_path, "Topics_10_5_f_tf_wide.pdf"), width = 11.7, height = 8.3)
grid.table(Topics_f_tf_opt_k_M_wide)
grid.text(
  label = "LDA based on filtered Dtm and tf-idf with k (3) and M (9)",
  x = 0.5, y = 0.95,
  gp = gpar(fontsize = 16, fontface = "bold")
)

dev.off()
## quartz_off_screen 
##                 2
set.seed(123)

dtm_sparse <- Matrix::sparseMatrix(
  i = Dtm_filt_tf$i, j = Dtm_filt_tf$j, x = Dtm_filt_tf$v,
  dims = c(Dtm_filt_tf$nrow, Dtm_filt_tf$ncol),
  dimnames = list(Docs(Dtm_filt_tf), Terms(Dtm_filt_tf))
)

# Topic-term matrix phi: P(term | topic), probability of a word given a topic
phi <- topicmodels::posterior(Lda_model_f_tf_opt_k_M)$terms

# Same order of terms
common_terms <- intersect(colnames(phi), colnames(dtm_sparse))
stopifnot(length(common_terms) >= 6) # Min 6 overlapping terms for meaningful coherence score calculation
phi <- phi[, common_terms, drop = FALSE]
dtm_sparse <- dtm_sparse[, common_terms, drop = FALSE]

# Calculates coherence for optimal number of top words per topic
coherence_vec <- textmineR::CalcProbCoherence(phi = phi, dtm = dtm_sparse, M = opt_M_f_tf)

# Mean across all topics
mean_coherence_f_tf_opt_k_M <- mean(coherence_vec, na.rm = TRUE)
coherence_vec   # Coherence per topic
##            1            2            3            4            5            6 
##  0.108527497  0.110026655  0.077848325  0.166609097  0.096260054  0.140853764 
##            7            8            9           10           11           12 
##  0.112748243 -0.007303847  0.138601230  0.146202588  0.078396053  0.036921173 
##           13           14           15 
##  0.307623292  0.128522259  0.056638910
mean_coherence_f_tf_opt_k_M  # Average coherence score
## [1] 0.1132317
k_results <- Out$k_results

ggplot(k_results, aes(x = k, y = mean_coh)) +
  geom_line() +
  geom_point() +
  geom_errorbar(
    aes(ymin = mean_coh - sd_coh,
        ymax = mean_coh + sd_coh),
    width = 0.2
  ) +
  labs(
    x = "Number of topics (k)",
    y = "Mean probabilistic coherence",
    # title = "Average probabilistic cohrence lda_model_f_tf_opt_k_M"
  ) +
  theme_minimal() +
  theme(
    axis.text  = element_text(size = 12),  
    axis.title = element_text(size = 12),
    plot.caption = element_text(size = 12, hjust = 0, lineheight = 1.5, margin = margin(t=10)) # more top margin for caption
)

ggsave(
  filename = "Average_probabilistic_cohrence_lda_model_f_tf_opt_k_M.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
M <- ifelse(opt_M_f_tf <= 10, opt_M_f_tf, 10) # The Palmetto service accepts at most 10 top words

cv_f_tf_opt_k_M <- cv_for_model(Lda_model_f_tf_opt_k_M, M = M, coherence = "cv")

cv_f_tf_opt_k_M_per_topic <- cv_f_tf_opt_k_M$topic_cv   # c_v per topic
cv_f_tf_opt_k_M_per_topic
##  [1] 0.4371384 0.4540070 0.3924674 0.3598332 0.5111525 0.5691882 0.5080738
##  [8] 0.2763191 0.5250556 0.5504341 0.3284679 0.3632105 0.7309534 0.5309512
## [15] 0.5603276
cv_av_f_tf_opt_k_M <- cv_f_tf_opt_k_M$mean_cv # mean c_v across topics
cv_av_f_tf_opt_k_M
## [1] 0.473172
Lda_model_f_tf_6_10 <- LDA(Dtm_filt_tf, k = 6, control = list(seed = 123))

set.seed(123)

gamma_mat <- topicmodels::posterior(Lda_model_f_tf_6_10)$topics
dim(gamma_mat)   # should be number of documents x k (= 6)
## [1] 220   6
doc_ids <- Docs(Dtm_filt_tf)  

DocTopics_long_tf_k6 <- tidy(Lda_model_f_tf_6_10, matrix = "gamma")

DocTopics_long_tf_k6 <- DocTopics_long_tf_k6 %>%
  mutate(
    doc_index = as.integer(document),    
    document  = doc_ids[doc_index]       
  ) %>%
  select(document, topic, gamma)

DocTopics_wide <- as_tibble(gamma_mat) %>%
  mutate(document = doc_ids) %>%
  relocate(document) %>%
  rename_with(
    ~ paste0("topic_", seq_along(.)),   
    .cols = -document
  )

DocTopics_wide_tf <- DocTopics_wide

# Check rowsum, has to be 1
DocTopics_wide <- DocTopics_wide %>%
  mutate(rowsum = rowSums(dplyr::select(., starts_with("topic_"))))

summary(DocTopics_wide$rowsum) # Note: DocTopics_wide_tf was copied before rowsum was added, so check DocTopics_wide instead
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
head(DocTopics_wide)
## # A tibble: 6 × 8
##   document  topic_1  topic_2  topic_3  topic_4  topic_5  topic_6 rowsum
##   <chr>       <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>  <dbl>
## 1 1        0.00322  0.00322  0.319    0.668    0.00322  0.00322       1
## 2 2        0.00103  0.00103  0.658    0.338    0.00103  0.00103       1
## 3 3        0.586    0.000290 0.000290 0.000290 0.000290 0.412         1
## 4 4        0.000192 0.129    0.000192 0.871    0.000192 0.000192      1
## 5 5        0.000689 0.000689 0.000689 0.734    0.000689 0.264         1
## 6 6        0.332    0.667    0.000303 0.000303 0.000303 0.000303      1
Topics <- tidy(Lda_model_f_tf_6_10, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(beta, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

Topics_f_tf_6_10_wide <- Topics %>%
  group_by(topic) %>%
  mutate(rank = row_number()) %>%
  select(topic, rank, term) %>%
  pivot_wider(
    names_from = rank,
    values_from = term,
    names_prefix = "term_"
  ) %>%
  ungroup()

pdf(file = file.path(topics_path, "Topics_f_tf_6_10_wide.pdf"), width = 11.7, height = 8.3)
grid.table(Topics_f_tf_6_10_wide)
grid.text(
  label = "LDA based on filtered Dtm and tf-idf with k (6) and M (10)",
  x = 0.5, y = 0.95,
  gp = gpar(fontsize = 16, fontface = "bold")
)

dev.off()
## quartz_off_screen 
##                 2
phi <- topicmodels::posterior(Lda_model_f_tf_6_10)$terms

# Same order of terms
common_terms <- intersect(colnames(phi), colnames(dtm_sparse))
stopifnot(length(common_terms) >= 6) # Min 6 overlapping terms for meaningful coherence score calculation
phi <- phi[, common_terms, drop = FALSE]
dtm_sparse <- dtm_sparse[, common_terms, drop = FALSE]

# Calculates coherence with 10 top words per topic
coherence_vec <- textmineR::CalcProbCoherence(phi = phi, dtm = dtm_sparse, M = 10)

# Mean across all topics
mean_coherence_f_tf_6_10 <- mean(coherence_vec, na.rm = TRUE)
coherence_vec   # Coherence per topic
##          1          2          3          4          5          6 
## 0.04920478 0.11071597 0.14342646 0.08151811 0.03738352 0.12368900
mean_coherence_f_tf_6_10  # Average coherence score
## [1] 0.09098964
cv_f_tf_6_10 <- cv_for_model(Lda_model_f_tf_6_10, M = 10, coherence = "cv")

cv_f_tf_6_10_per_topic <- cv_f_tf_6_10$topic_cv   # c_v per topic
cv_f_tf_6_10_per_topic
## [1] 0.5067316 0.3730815 0.5444911 0.4864700 0.4501460 0.4863338
cv_av_f_tf_6_10 <- cv_f_tf_6_10$mean_cv # mean c_v across topics
cv_av_f_tf_6_10
## [1] 0.4745424
topic_share_corpus_tf_k6 <- DocTopics_wide_tf %>%
  select(starts_with("topic_")) %>%
  summarise(across(everything(), mean))

topic_mean_long <- topic_share_corpus_tf_k6 %>%
  pivot_longer(
    everything(),
    names_to  = "topic",
    values_to = "mean_prop"
  ) %>%
  mutate(
    topic_num   = as.integer(sub("topic_", "", topic)),  
    mean_percent = mean_prop * 100                       
  ) %>%
  arrange(topic_num)

ggplot(topic_mean_long,
       aes(x = factor(topic_num), y = mean_percent)) +
  geom_col() +
  scale_y_continuous(limits = c(0, 30)) +
  labs(
    #title = "Average topic distribution across documents \nfor Lda_model_f_tf_6_10",
    x     = "Topic",
    y     = "Share (%)"
  ) +
  theme_minimal() +
  theme(
    axis.text  = element_text(size = 30),  
    axis.title = element_text(size = 30),
    plot.caption = element_text(size = 30, hjust = 0, lineheight = 1.5, margin = margin(t=10)) # more top margin for caption
)

ggsave(
  filename = "Average_topic_distribution_across_documents_Lda_model_f_tf_6_10.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

doc_topics <- DocTopics_long_tf_k6 %>%
  filter(document == 8) %>%
  mutate(topic = factor(topic, levels = 1:6))
  
ggplot(doc_topics, aes(x = topic, y = gamma * 100)) +
  geom_col() +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = paste("Topic distribution per document", doc_id),
    x     = "Topic",
    y     = "Share (%) (P(Topic | Document))"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(
  filename = "Topic distribution for document 8 tfidf.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
# Manually entered gamma values for document 8, used for a presentation-ready version of the plot
doc_id <- 8
doc_topics <- tibble(
  document = doc_id,
  topic    = factor(1:6, levels = 1:6),
  gamma    = c(0.19, 0.42, 0.001, 0.001, 0.001, 0.39)
)

ggplot(doc_topics, aes(x = topic, y = gamma * 100)) +
  geom_col(width = 0.85) +
  scale_x_discrete(drop = FALSE) +
  scale_y_continuous(limits = c(0, 50)) +
  labs(
    x     = "Topic",
    y     = "Share (%)"
  ) +
  theme_minimal(base_size = 18) +
  theme(
    axis.text.x  = element_text(size = 30),
    axis.text.y  = element_text(size = 30),
    axis.title.x = element_text(size = 30),
    axis.title.y = element_text(size = 30),
    plot.title   = element_text(size = 30)
  )

ggsave(
  filename = "Average_topic_distribution_topic_xy.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
set.seed(123)

# Define penalty function
penalty_factor <- function(share, low = 0.60, high = 0.80,
                           min_mult = 0.30, shape = 2) {
  out <- ifelse(
    share <= low, 1,
    1 - (1 - min_mult) * ((share - low) / (high - low))^shape
  )
  pmax(out, min_mult)  # never below min_mult
}
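# Illustration of the penalty curve: full weight up to a 60% document share,
# quadratic down-weighting between 60% and 80%, floored at 0.30 beyond that:
# penalty_factor(c(0.50, 0.60, 0.70, 0.80, 0.95))
# # 1.000 1.000 0.825 0.300 0.300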

# Basic counts and document frequency 
# total number of documents
doc_total <- n_distinct(Tokens_filt$doc_id)

# df per term and share across documents, plus multiplier
df_tbl_filt <- Tokens_filt %>%
  distinct(doc_id, word) %>%      # Presence/absence per document
  count(word, name = "df") %>%    # Share across documents
  mutate(
    share = df / doc_total,       # Document share
    mult  = penalty_factor(share) # Penalty multiplier
  )

# weighted counts per doc-term
Doc_term_counts_weighted <- Tokens_filt %>%
  count(doc_id, word, name = "n") %>%    # raw counts
  left_join(df_tbl_filt %>% select(word, mult), by = "word") %>%
  mutate(
    mult       = ifelse(is.na(mult), 1, mult),  
    n_weighted = n * mult  # Apply penalty
  )

# Make counts integer and drop zeros 
Doc_term_counts_weighted_int <- Doc_term_counts_weighted %>%
  mutate(
    n_weighted = round(n_weighted),                       # LDA needs ints
    n_weighted = ifelse(n_weighted < 0, 0, n_weighted)
  ) %>%
  filter(n_weighted > 0)

# Build weighted DTM 
Dtm_filt_weight <- Doc_term_counts_weighted_int %>%
  cast_dtm(document = doc_id,
           term     = word,
           value    = n_weighted)
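# Spot check: the multipliers are <= 1, so the weighted DTM cannot contain more
# total mass than the unweighted filtered DTM
# sum(Dtm_filt_weight$v) <= sum(Dtm_filt$v)  # TRUE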

rm(Doc_term_counts_weighted_int)
rm(Doc_term_counts_weighted)

rm(doc_id)
# Applies function, calculation of optimal k and M
set.seed(123)
Out <- tune_k_then_M(
  dtm_tm   = Dtm_filt_weight,
  k_values = 2:15,
  M_for_k  = 10,
  folds_k  = 5,
  M_values = 5:20
)

opt_k_f_w <- Out$best_k
opt_M_f_w <- Out$best_M
head(Out$k_results)
## # A tibble: 6 × 4
##       k mean_coh  sd_coh folds_used
##   <int>    <dbl>   <dbl>      <int>
## 1     2   0.0967 0.0169           5
## 2     3   0.0906 0.0121           5
## 3     4   0.0826 0.00685          5
## 4     5   0.0872 0.00537          5
## 5     6   0.0789 0.00725          5
## 6     7   0.0842 0.0146           5
head(Out$M_results)
## # A tibble: 6 × 2
##       M mean_coh
##   <int>    <dbl>
## 1     5   0.123 
## 2     6   0.119 
## 3     7   0.112 
## 4     8   0.108 
## 5     9   0.0996
## 6    10   0.0958
opt_k_f_w
## [1] 2
opt_M_f_w
## [1] 5
rm(df_tbl_filt)
Lda_model_f_w_opt_k_M <- LDA(Dtm_filt_weight, k = opt_k_f_w, control = list(seed = 123))
set.seed(123)

gamma_mat <- topicmodels::posterior(Lda_model_f_w_opt_k_M)$topics
dim(gamma_mat)   # should be number of documents x opt_k_f_w
## [1] 220   2
doc_ids <- Docs(Dtm_filt_weight)  

DocTopics_long <- tidy(Lda_model_f_w_opt_k_M, matrix = "gamma")

DocTopics_long <- DocTopics_long %>%
  mutate(
    doc_index = as.integer(document),    
    document  = doc_ids[doc_index]       
  ) %>%
  select(document, topic, gamma)

DocTopics_wide <- as_tibble(gamma_mat) %>%
  mutate(document = doc_ids) %>%
  relocate(document) %>%
  rename_with(
    ~ paste0("topic_", seq_along(.)),   
    .cols = -document
  )

# Check rowsum, has to be 1
DocTopics_wide <- DocTopics_wide %>%
  mutate(rowsum = rowSums(dplyr::select(., starts_with("topic_"))))

summary(DocTopics_wide$rowsum)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
head(DocTopics_wide)
## # A tibble: 6 × 4
##   document topic_1  topic_2 rowsum
##   <chr>      <dbl>    <dbl>  <dbl>
## 1 1          0.450 0.550         1
## 2 2          0.448 0.552         1
## 3 3          0.528 0.472         1
## 4 4          1.000 0.000446      1
## 5 5          0.747 0.253         1
## 6 6          0.941 0.0594        1
topic_share_corpus <- DocTopics_wide %>%
  select(starts_with("topic_")) %>%
  summarise(across(everything(), mean))


topic_mean_long <- topic_share_corpus %>%
  pivot_longer(
    everything(),
    names_to  = "topic",
    values_to = "mean_prop"
  ) %>%
  mutate(
    topic_num   = as.integer(sub("topic_", "", topic)),  
    mean_percent = mean_prop * 100                       
  ) %>%
  arrange(topic_num)

ggplot(topic_mean_long,
       aes(x = factor(topic_num), y = mean_percent)) +
  geom_col() +
  labs(
    title = "Average topic distribution across documents n\for Lda_model_f_w_opt_k_M",
    x     = "Topic",
    y     = "Share (%) (P(Ttopic | Document))"
  )

ggsave(
  filename = "Average_topic_distribution_across_documents_Lda_model_f_w_opt_k_M.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)

rm(topic_mean_long)
rm(topic_share_corpus)
pdf(file.path("/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots",  "Topic_distribution_per_document_filtered_weighted_Dtm.pdf"), width = 8, height = 5) # For reproduction, specify your path

for (doc_id in doc_ids) {
  
  doc_topics <- DocTopics_long %>%
    filter(document == doc_id) %>%
    mutate(
      topic_num = as.integer(gsub("[^0-9]", "", as.character(topic))),
      topic_num = factor(topic_num, levels = seq_len(opt_k_f_w))
    )
  
p <- ggplot(doc_topics, aes(x = topic_num, y = gamma * 100)) +
  geom_col() +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = paste("Topic distribution per document for Lda_model_f_w_opt_k_M", doc_id),
    x     = "Topic",
    y     = "Share (%) (P(Topic | Document))"
    ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p)

}

dev.off() 
## quartz_off_screen 
##                 2
rm(p)
rm(doc_topics)
rm(doc_ids)
# Result of topic modeling with opt_k_f_w (2) topics and opt_M_f_w (5) top words per topic (beta = P(term|topic))
Topics <- tidy(Lda_model_f_w_opt_k_M, matrix = "beta") %>%
  group_by(topic) %>%
  slice_max(beta, n = opt_M_f_w, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

Topics_f_w_opt_k_M_wide <- Topics %>%
  group_by(topic) %>%
  mutate(rank = row_number()) %>%
  select(topic, rank, term) %>%
  pivot_wider(
    names_from = rank,
    values_from = term,
    names_prefix = "term_"
  ) %>%
  ungroup()


pdf(file = file.path(topics_path, "Topics_opt_k_M_f_w_wide.pdf"), width = 11.7, height = 8.3)

grid.table(Topics_f_w_opt_k_M_wide)
grid.text(
  label = "LDA based on filtered Dtm and weighted word occurence with 6 topics and 7 words",
  x = 0.5, y = 0.95,                   
  gp = gpar(fontsize = 16, fontface = "bold")  
)
dev.off()
## quartz_off_screen 
##                 2
set.seed(123)

dtm_sparse <- Matrix::sparseMatrix(
  i = Dtm_filt_weight$i, j = Dtm_filt_weight$j, x = Dtm_filt_weight$v,
  dims = c(Dtm_filt_weight$nrow, Dtm_filt_weight$ncol),
  dimnames = list(Docs(Dtm_filt_weight), Terms(Dtm_filt_weight))
)

# Phi (P(term|topic))
phi <- topicmodels::posterior(Lda_model_f_w_opt_k_M)$terms

# Ensure the same term order
common_terms <- intersect(colnames(phi), colnames(dtm_sparse))
stopifnot(length(common_terms) >= 6)
phi <- phi[, common_terms, drop = FALSE]
dtm_sparse <- dtm_sparse[, common_terms, drop = FALSE]

# Calculates coherence for the optimal number of top words per topic (M = opt_M_f_w = 5)
coherence_vec <- textmineR::CalcProbCoherence(phi = phi, dtm = dtm_sparse, M = opt_M_f_w)

# Mean across all topics
mean_coherence_f_w_opt_k_M <- mean(coherence_vec, na.rm = TRUE)
coherence_vec   # Coherence per topic
##          1          2 
## 0.08291539 0.14337816
mean_coherence_f_w_opt_k_M  # Average coherence score
## [1] 0.1131468
rm(common_terms)
rm(coherence_vec)
k_results <- Out$k_results

ggplot(k_results, aes(x = k, y = mean_coh)) +
  geom_line() +
  geom_point() +
  geom_errorbar(
    aes(ymin = mean_coh - sd_coh,
        ymax = mean_coh + sd_coh),
    width = 0.2
  ) +
  labs(
    x = "Number of topics (k)",
    y = "Mean probabilistic coherence",
    title = "Average probabilistic cohrence lda_model_f_w_opt_k_M"
  ) +
  theme_minimal()

ggsave(
  filename = "Average_probabilistic_cohrence_lda_model_f_w_opt_k_M.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
M <- ifelse(opt_M_f_w <= 10, opt_M_f_w, 10) # The Palmetto service accepts at most 10 top words

cv_f_w_opt_k_M <- cv_for_model(Lda_model_f_w_opt_k_M, M = M, coherence = "cv")

cv_f_w_opt_k_M_per_topic <- cv_f_w_opt_k_M$topic_cv   # c_v per topic
cv_f_w_opt_k_M_per_topic
## [1] 0.6057884 0.6515692
cv_av_f_w_opt_k_M <- cv_f_w_opt_k_M$mean_cv # mean c_v across topics
cv_av_f_w_opt_k_M
## [1] 0.6286788
# Adapted from the Python scraper at https://github.com/c3nk/THE-World-University-Rankings/blob/main/the_university_rankings_full.py

# install.packages(c("httr2","jsonlite","dplyr","purrr","readr","fs","stringr"))
library(httr2)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(dplyr)
library(purrr)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:koRpus':
## 
##     tokenize
library(fs)
library(stringr)

BASE_URL <- "https://www.timeshighereducation.com/json/ranking_tables/world_university_rankings"

UA <- "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

RANKINGS_FIELDS <- c(
  rank = "Rank",
  name = "Name",
  scores_overall = "Overall",
  scores_teaching = "Teaching",
  scores_research = "Research Environment",
  scores_citations = "Research Quality",
  scores_industry_income = "Industry",
  scores_international_outlook = "International Outlook"
)

KEY_STATISTICS_FIELDS <- c(
  rank = "Rank",
  name = "Name",
  stats_number_students = "No. of FTE students",
  stats_student_staff_ratio = "No. of students per staff",
  stats_pc_intl_students = "International students",
  stats_female_male_ratio = "Female:Male ratio"
)

fetch_json <- function(url) {
  resp <- request(url) |>
    req_user_agent(UA) |>
    req_headers(Accept = "application/json, text/plain, */*") |>
    req_timeout(60) |>
    req_perform()
  if (resp_status(resp) == 200) {
    resp |>
      resp_body_string() |>
      fromJSON(simplifyVector = FALSE)
  } else {
    message("[WARN] ", resp_status(resp), " for ", url)
    NULL
  }
}
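# Example call against the live endpoint (network-dependent; the JSON layout may
# change over time, so treat the field mappings above as a snapshot):
# j <- fetch_json(sprintf("%s/%s", BASE_URL, 2024))
# length(j$data)  # number of ranked institutions in that edition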

filter_data_for_db <- function(data, year, field_mapping) {
  # Expect list with element "data"
  if (is.null(data) || is.null(data$data)) return(NULL)
  
  out <- map(data$data, function(u) {
    row <- list(year = year)
    
    for (json_field in names(field_mapping)) {
      db_field <- field_mapping[[json_field]]
      value <- u[[json_field]]
      # rank can be "=123"
      if (identical(json_field, "rank") && is.character(value) && str_starts(value, "=")) {
        row$rank_prefix <- "="
        row[[db_field]] <- sub("^=", "", value)
      } else {
        # clean numeric-ish strings for score and ratio fields
        if (str_starts(json_field, "scores_") || identical(json_field, "stats_student_staff_ratio")) {
          value <- if (is.null(value) || identical(value, "")) "" else gsub(",", "", as.character(value))
        }
        row[[db_field]] <- if (is.null(value)) "" else value
      }
    }
    row
  })
  
  # return as a data.frame-like list
  list(data = out)
}

save_outputs <- function(year, datalist, name) {
  if (is.null(datalist) || is.null(datalist$data)) {
    message("[WARN] No data for ", year, " ", name, ".")
    return(invisible(NULL))
  }
  dir_create("outputs/json")
  dir_create("outputs/csv")
  
  json_path <- file.path("outputs/json", sprintf("THE_%s_%s.json", year, name))
  writeLines(jsonlite::toJSON(datalist, auto_unbox = TRUE, pretty = TRUE, ensure_ascii = FALSE), json_path)
  
  df <- jsonlite::fromJSON(jsonlite::toJSON(datalist$data, auto_unbox = TRUE), flatten = TRUE)
  csv_path <- file.path("outputs/csv", sprintf("THE_%s_%s.csv", year, name))
  readr::write_csv(df, csv_path)
  message("[DONE] ", year, " ", name, ": ", nrow(df), " rows → ", csv_path)
}

process_year <- function(year, sleep_secs = 1) {
  message("\n=== YEAR ", year, " ===")
  
  # Rankings
  url_rank <- sprintf("%s/%s", BASE_URL, year)
  rankings <- fetch_json(url_rank)
  rankings_f <- filter_data_for_db(rankings, year, RANKINGS_FIELDS)
  save_outputs(year, rankings_f, "rankings")
  Sys.sleep(sleep_secs)
  
  # Key statistics
  url_key <- sprintf("%s/%s/key_statistics", BASE_URL, year)
  key_stats <- fetch_json(url_key)
  key_stats_f <- filter_data_for_db(key_stats, year, KEY_STATISTICS_FIELDS)
  save_outputs(year, key_stats_f, "key_statistics")
  Sys.sleep(sleep_secs)
  
  list(rankings = rankings_f, key_stats = key_stats_f)
}

# ---- Run for a range of years and also keep everything in-memory ----
years <- 2011:2026
all_raw <- map(years, process_year)
## 
## === YEAR 2011 ===
## [DONE] 2011 rankings: 200 rows → outputs/csv/THE_2011_rankings.csv
## [DONE] 2011 key_statistics: 200 rows → outputs/csv/THE_2011_key_statistics.csv
## 
## === YEAR 2012 ===
## [DONE] 2012 rankings: 402 rows → outputs/csv/THE_2012_rankings.csv
## [DONE] 2012 key_statistics: 402 rows → outputs/csv/THE_2012_key_statistics.csv
## 
## === YEAR 2013 ===
## [DONE] 2013 rankings: 400 rows → outputs/csv/THE_2013_rankings.csv
## [DONE] 2013 key_statistics: 400 rows → outputs/csv/THE_2013_key_statistics.csv
## 
## === YEAR 2014 ===
## [DONE] 2014 rankings: 400 rows → outputs/csv/THE_2014_rankings.csv
## [DONE] 2014 key_statistics: 400 rows → outputs/csv/THE_2014_key_statistics.csv
## 
## === YEAR 2015 ===
## [DONE] 2015 rankings: 401 rows → outputs/csv/THE_2015_rankings.csv
## [DONE] 2015 key_statistics: 401 rows → outputs/csv/THE_2015_key_statistics.csv
## 
## === YEAR 2016 ===
## [DONE] 2016 rankings: 800 rows → outputs/csv/THE_2016_rankings.csv
## [DONE] 2016 key_statistics: 800 rows → outputs/csv/THE_2016_key_statistics.csv
## 
## === YEAR 2017 ===
## [DONE] 2017 rankings: 981 rows → outputs/csv/THE_2017_rankings.csv
## [DONE] 2017 key_statistics: 981 rows → outputs/csv/THE_2017_key_statistics.csv
## 
## === YEAR 2018 ===
## [DONE] 2018 rankings: 1103 rows → outputs/csv/THE_2018_rankings.csv
## [DONE] 2018 key_statistics: 1103 rows → outputs/csv/THE_2018_key_statistics.csv
## 
## === YEAR 2019 ===
## [DONE] 2019 rankings: 1258 rows → outputs/csv/THE_2019_rankings.csv
## [DONE] 2019 key_statistics: 1258 rows → outputs/csv/THE_2019_key_statistics.csv
## 
## === YEAR 2020 ===
## [DONE] 2020 rankings: 1397 rows → outputs/csv/THE_2020_rankings.csv
## [DONE] 2020 key_statistics: 1397 rows → outputs/csv/THE_2020_key_statistics.csv
## 
## === YEAR 2021 ===
## [DONE] 2021 rankings: 1526 rows → outputs/csv/THE_2021_rankings.csv
## [DONE] 2021 key_statistics: 1526 rows → outputs/csv/THE_2021_key_statistics.csv
## 
## === YEAR 2022 ===
## [DONE] 2022 rankings: 2112 rows → outputs/csv/THE_2022_rankings.csv
## [DONE] 2022 key_statistics: 2112 rows → outputs/csv/THE_2022_key_statistics.csv
## 
## === YEAR 2023 ===
## [DONE] 2023 rankings: 2345 rows → outputs/csv/THE_2023_rankings.csv
## [DONE] 2023 key_statistics: 2345 rows → outputs/csv/THE_2023_key_statistics.csv
## 
## === YEAR 2024 ===
## [DONE] 2024 rankings: 2671 rows → outputs/csv/THE_2024_rankings.csv
## [DONE] 2024 key_statistics: 2671 rows → outputs/csv/THE_2024_key_statistics.csv
## 
## === YEAR 2025 ===
## [DONE] 2025 rankings: 2855 rows → outputs/csv/THE_2025_rankings.csv
## [DONE] 2025 key_statistics: 2855 rows → outputs/csv/THE_2025_key_statistics.csv
## 
## === YEAR 2026 ===
## [DONE] 2026 rankings: 3118 rows → outputs/csv/THE_2026_rankings.csv
## [DONE] 2026 key_statistics: 3118 rows → outputs/csv/THE_2026_key_statistics.csv
# In-memory tibbles you can use right away
rankings_tbl <- map2_dfr(years, all_raw, function(y, lst) {
  if (is.null(lst$rankings) || is.null(lst$rankings$data)) return(tibble())
  as_tibble(jsonlite::fromJSON(jsonlite::toJSON(lst$rankings$data, auto_unbox = TRUE)))
})

key_stats_tbl <- map2_dfr(years, all_raw, function(y, lst) {
  if (is.null(lst$key_stats) || is.null(lst$key_stats$data)) return(tibble())
  as_tibble(jsonlite::fromJSON(jsonlite::toJSON(lst$key_stats$data, auto_unbox = TRUE)))
})

# Peek
dplyr::glimpse(rankings_tbl)
## Rows: 21,969
## Columns: 10
## $ year                    <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Rank                    <chr> "1", "2", "3", "4", "5", "6", "6", "8", "9", "…
## $ Name                    <chr> "Harvard University", "California Institute of…
## $ Overall                 <chr> "96.1", "96.0", "95.6", "94.3", "94.2", "91.2"…
## $ Teaching                <chr> "99.7", "97.7", "97.8", "98.3", "90.9", "88.2"…
## $ `Research Environment`  <chr> "98.7", "98.0", "91.4", "98.1", "95.4", "93.9"…
## $ `Research Quality`      <chr> "98.8", "99.9", "99.9", "99.2", "99.9", "95.1"…
## $ Industry                <chr> "34.5", "83.7", "87.5", "64.3", "-", "73.5", "…
## $ `International Outlook` <chr> "72.4", "54.6", "82.3", "29.5", "70.3", "77.2"…
## $ rank_prefix             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
dplyr::glimpse(key_stats_tbl)
## Rows: 21,969
## Columns: 8
## $ year                        <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, …
## $ Rank                        <chr> "1", "2", "3", "4", "5", "6", "6", "8", "9…
## $ Name                        <chr> "Harvard University", "California Institut…
## $ `No. of FTE students`       <chr> "", "", "", "", "", "", "", "", "", "", ""…
## $ `No. of students per staff` <chr> "", "", "", "", "", "", "", "", "", "", ""…
## $ `International students`    <chr> "", "", "", "", "", "", "", "", "", "", ""…
## $ `Female:Male ratio`         <chr> "", "", "", "", "", "", "", "", "", "", ""…
## $ rank_prefix                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
THE_ranking <- rankings_tbl
THE_stats <- key_stats_tbl

rm(rankings_tbl)
rm(key_stats_tbl)

rm(Regression)
## Warning in rm(Regression): object 'Regression' not found
write_xlsx(
  THE_ranking,
  path = file.path("/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Data", 
                   "THE ranking 2011-2026.xlsx") # For reproduction, specify your path
)
  
write_xlsx(
  THE_stats,
  path = file.path("/Users/henribisch-chandaroff/Desktop/Studium/Master/Master Thesis/Data Analysis/Data",
                   "THE key stats 2011-2026.xlsx") # For reproduction, specify your path
)
#library(dplyr)
#library(tidyr)
#library(stringr)
#library(stringi)

years_full <- 2011:2026

# first non-NA (if duplicates per Name-year exist)
first_non_na <- function(z) {
  z <- z[!is.na(z)]
  if (length(z) == 0) NA else z[1]
}
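# e.g. first_non_na(c(NA, "71.2", "70.9")) returns "71.2";
#      first_non_na(c(NA, NA)) returns NA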

# Cleaning and data processing of THE Data
Unique_uni <- Data %>%
  mutate(Name = clean_name(Name)) %>%
  distinct(Name)

THE_ranking <- THE_ranking %>%
  mutate(
    Name = clean_name(Name),
    year = as.integer(year)
  ) %>%
  mutate(across(where(is.character), ~ na_if(.x, "na"))) %>%
  mutate(across(where(is.character), ~ na_if(.x, "NA"))) %>%
  mutate(across(where(is.character), ~ na_if(.x, "-"))) %>%
  group_by(Name, year) %>%
  summarise(across(everything(), first_non_na), .groups = "drop")

THE_ranking <- THE_ranking %>%
  mutate(
    Overall = gsub("–", "-", trimws(as.character(Overall))),   # normalize dash
    Mean_overall = ifelse(
      Overall %in% c("", "-", "NaN"),
      NA_real_,
      ifelse(
        grepl("-", Overall, fixed = TRUE),
        sapply(strsplit(Overall, "-", fixed = TRUE),
               function(z) mean(as.numeric(z), na.rm = TRUE)),
        suppressWarnings(as.numeric(Overall))
      )
    )
  )
## Warning: There were 4230 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Mean_overall = ifelse(...)`.
## Caused by warning in `mean()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 4229 remaining warnings.
# turn NaN (e.g., mean of all-NA) into NA
THE_ranking$Mean_overall[is.nan(THE_ranking$Mean_overall)] <- NA_real_

THE_ranking <- THE_ranking %>%
  rename(Year = year,
         Research_environment = "Research Environment",
         International_outlook = "International Outlook",
         Research_quality = "Research Quality")
#Matching 
Data$DocID <- as.character(Data$DocID)
DocTopic_wide_tf <- DocTopics_wide_tf %>% rename(DocID = document)

Regression <- Data %>%
  select(
    DocID, Year, Name, Country, Continent,
    No_Students, Director, Staff, TTO_founding_year, TTO_personnel
  ) %>%
  left_join(DocTopic_wide_tf, by = "DocID") %>%
  mutate(Year = suppressWarnings(as.integer(as.character(Year)))) %>%
  filter(!is.na(Year), Year >= 2010L)

# Building time lags
time_lag <- 1L # Adjust time lag here

Regression <- Regression %>%
  mutate(
    OutcomeYear = case_when(
      is.na(Year) ~ 2026L,
      tolower(trimws(as.character(Year))) %in% c("no date", "nodate", "no_date") ~ 2026L,
      is.na(suppressWarnings(as.integer(as.character(Year)))) ~ 2026L,
      suppressWarnings(as.integer(as.character(Year))) <= 2008L ~ 2011L,
      suppressWarnings(as.integer(as.character(Year))) >= 2025L ~ 2026L,
      TRUE ~ suppressWarnings(as.integer(as.character(Year))) + time_lag
    )
  )
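# Toy check of the mapping with time_lag = 1 (hypothetical ad years): years up
# to 2008 are clamped to the first THE edition (2011), years from 2025 onward
# to the latest edition (2026), and everything else is shifted by the lag:
# 2005 -> 2011, 2019 -> 2020, 2024 -> 2025, 2025 -> 2026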

sum(Regression$Year == 2025, na.rm = TRUE)
## [1] 107
THE_ranking <- THE_ranking %>%
  mutate(Year = suppressWarnings(as.integer(as.character(Year))))

Regression <- Regression %>%
  left_join(THE_ranking, by = c("Name" = "Name", "OutcomeYear" = "Year"))
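# Illustrative coverage check (value depends on the data at hand): share of
# ads that found a THE record in their outcome year
# mean(!is.na(Regression$Mean_overall))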
library(dplyr)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'lmtest'
## The following object is masked from 'package:future':
## 
##     reset
library(sandwich)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
dropped_topic <- "topic_2"  # <- baseline topic to drop, adjust here
dv_vars <- c(
  "Mean_overall",
  "Teaching",
  "International_outlook",
  "Research_quality",
  "Research_environment",
  "Industry"
)

topic_cols <- grep("^topic_", names(Regression), value = TRUE)
stopifnot(dropped_topic %in% topic_cols)

x_cols_base <- setdiff(topic_cols, dropped_topic)

# Director dummy
stopifnot("Director" %in% names(Regression))

Regression <- Regression %>%
  mutate(
    Director = suppressWarnings(as.numeric(as.character(Director))),
    Director = if_else(is.na(Director), 0, if_else(Director > 0, 1, 0))
  )
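# The dummy coding on hypothetical raw values c(NA, "0", "2", "x") yields
# 0, 0, 1, 0: missing and non-numeric entries default to 0, positives to 1.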

x_cols_base <- c(x_cols_base, "Director")

# Run model function
run_model <- function(dv, x_cols_base, data = Regression) {

  df_reg <- data %>%
    transmute(
      Name,
      y = suppressWarnings(as.numeric(as.character(.data[[dv]]))),
      across(all_of(x_cols_base), ~ suppressWarnings(as.numeric(as.character(.x))))
    ) %>%
    filter(is.finite(y)) %>%
    filter(if_all(all_of(x_cols_base), is.finite))

  # drop zero-variance predictors within THIS sample
  zero_var <- sapply(df_reg[x_cols_base], function(z) sd(z, na.rm = TRUE) == 0)
  zero_var[is.na(zero_var)] <- FALSE
  x_use <- x_cols_base[!zero_var]

  if (length(x_use) == 0) stop("No predictors left after zero-variance drop for DV: ", dv)

  # scale topic shares only (keep Director as 0/1 for interpretation)
  topic_like <- intersect(x_use, grep("^topic_", x_use, value = TRUE))
  df_reg <- df_reg %>%
    mutate(across(all_of(topic_like), ~ as.numeric(scale(.x))))

  # fit OLS
  fml <- as.formula(paste("y ~", paste(x_use, collapse = " + ")))
  fit <- lm(fml, data = df_reg)

  # cluster-robust SEs (HC3) at institution level
  vc <- sandwich::vcovCL(fit, cluster = df_reg$Name, type = "HC3")
  se_cl <- sqrt(diag(vc))
  rob <- lmtest::coeftest(fit, vcov. = vc)

  list(dv = dv, fit = fit, df_reg = df_reg, se = se_cl, robust = rob, x_use = x_use)
}
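# A single model can be fitted and inspected in isolation, e.g.:
# m_teach <- run_model("Teaching", x_cols_base)
# m_teach$robust  # cluster-robust coefficient table
# m_teach$x_use   # predictors retained after the zero-variance screen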

# Runs all models
mods <- lapply(dv_vars, run_model, x_cols_base = x_cols_base, data = Regression)
names(mods) <- dv_vars

# Results with cluster-robust SEs
for (dv in dv_vars) {
  fit <- mods[[dv]]$fit
  s   <- summary(fit)

  cat("\n==============================\n")
  cat("DV:", dv, "\n")
  cat("Observations (N):", nobs(fit), "\n")
  cat("Clusters (institutions):", dplyr::n_distinct(mods[[dv]]$df_reg$Name), "\n")
  cat("R2:", round(s$r.squared, 3), "\n")
  cat("Adj. R2:", round(s$adj.r.squared, 3), "\n\n")

  print(mods[[dv]]$robust)
}
## 
## ==============================
## DV: Mean_overall 
## Observations (N): 135 
## Clusters (institutions): 101 
## R2: 0.024 
## Adj. R2: -0.022 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.27201    3.61630 16.9433   <2e-16 ***
## topic_1      1.25487    3.83085  0.3276   0.7438    
## topic_3      0.89174    2.83473  0.3146   0.7536    
## topic_4      0.45814    3.14790  0.1455   0.8845    
## topic_5      0.43267    3.29193  0.1314   0.8956    
## topic_6      3.50034    3.82978  0.9140   0.3624    
## Director    -2.27917    3.75191 -0.6075   0.5446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Teaching 
## Observations (N): 138 
## Clusters (institutions): 104 
## R2: 0.028 
## Adj. R2: -0.017 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.29694    4.17804 12.2778   <2e-16 ***
## topic_1      2.00885    3.71064  0.5414   0.5892    
## topic_3      1.72554    2.66093  0.6485   0.5178    
## topic_4      0.55361    3.17372  0.1744   0.8618    
## topic_5      0.39453    3.35923  0.1174   0.9067    
## topic_6      4.29849    3.86260  1.1128   0.2678    
## Director    -1.41321    4.20113 -0.3364   0.7371    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: International_outlook 
## Observations (N): 138 
## Clusters (institutions): 104 
## R2: 0.122 
## Adj. R2: 0.082 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 66.46051    2.53070 26.2617  < 2e-16 ***
## topic_1     -0.38881    2.35378 -0.1652  0.86905    
## topic_3     -4.52623    2.06564 -2.1912  0.03021 *  
## topic_4     -4.21546    2.18100 -1.9328  0.05542 .  
## topic_5     -4.43608    2.15025 -2.0631  0.04108 *  
## topic_6     -6.13640    2.42259 -2.5330  0.01249 *  
## Director     0.22093    2.86800  0.0770  0.93871    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Research_quality 
## Observations (N): 138 
## Clusters (institutions): 104 
## R2: 0.056 
## Adj. R2: 0.012 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 78.59197    3.30845 23.7550   <2e-16 ***
## topic_1      2.49727    2.62327  0.9520   0.3429    
## topic_3      1.03804    2.34010  0.4436   0.6581    
## topic_4      0.29243    2.83951  0.1030   0.9181    
## topic_5      1.85163    2.43284  0.7611   0.4480    
## topic_6      4.76351    2.57802  1.8477   0.0669 .  
## Director    -1.86340    3.52505 -0.5286   0.5980    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Research_environment 
## Observations (N): 138 
## Clusters (institutions): 104 
## R2: 0.019 
## Adj. R2: -0.025 
## 
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.390406   4.568642 11.2485   <2e-16 ***
## topic_1      0.354732   4.137492  0.0857   0.9318    
## topic_3     -0.054804   3.170964 -0.0173   0.9862    
## topic_4     -1.356168   3.744339 -0.3622   0.7178    
## topic_5     -1.527739   3.889258 -0.3928   0.6951    
## topic_6      2.124749   4.528675  0.4692   0.6397    
## Director    -2.836162   4.772302 -0.5943   0.5533    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Industry 
## Observations (N): 137 
## Clusters (institutions): 104 
## R2: 0.079 
## Adj. R2: 0.036 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  64.2650     3.4191 18.7958 < 2.2e-16 ***
## topic_1       1.5804     2.7315  0.5786  0.563877    
## topic_3       1.1504     2.0741  0.5547  0.580085    
## topic_4       4.5348     2.3744  1.9099  0.058352 .  
## topic_5       6.6282     2.1897  3.0271  0.002978 ** 
## topic_6       4.0435     2.6665  1.5164  0.131855    
## Director      2.6835     3.6874  0.7277  0.468081    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Creates LaTeX table for the first three DVs
fits1 <- lapply(mods[dv_vars[1:3]], `[[`, "fit")
ses1  <- lapply(mods[dv_vars[1:3]], `[[`, "se")

stargazer(
  fits1,
  type = "latex",
  se = ses1,
  title = "Regression Results (Cluster-robust SEs)",
  column.labels = c("Mean overall", "Teaching", "International outlook"),
  model.numbers = FALSE,
  dep.var.caption = "",
  dep.var.labels.include = FALSE,
  digits = 3,
  no.space = TRUE,
  omit.stat = c("ser", "f"),  
  out = "regression_table_1.tex"
)
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Fri, Mar 20, 2026 - 16:24:39
## \begin{table}[!htbp] \centering 
##   \caption{Regression Results (Cluster-robust SEs)} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & Mean overall & Teaching & International outlook \\ 
##  topic\_1 & 1.255 & 2.009 & $-$0.389 \\ 
##   & (3.831) & (3.711) & (2.354) \\ 
##   topic\_3 & 0.892 & 1.726 & $-$4.526$^{**}$ \\ 
##   & (2.835) & (2.661) & (2.066) \\ 
##   topic\_4 & 0.458 & 0.554 & $-$4.215$^{*}$ \\ 
##   & (3.148) & (3.174) & (2.181) \\ 
##   topic\_5 & 0.433 & 0.395 & $-$4.436$^{**}$ \\ 
##   & (3.292) & (3.359) & (2.150) \\ 
##   topic\_6 & 3.500 & 4.298 & $-$6.136$^{**}$ \\ 
##   & (3.830) & (3.863) & (2.423) \\ 
##   Director & $-$2.279 & $-$1.413 & 0.221 \\ 
##   & (3.752) & (4.201) & (2.868) \\ 
##   Constant & 61.272$^{***}$ & 51.297$^{***}$ & 66.461$^{***}$ \\ 
##   & (3.616) & (4.178) & (2.531) \\ 
##  \hline \\[-1.8ex] 
## Observations & 135 & 138 & 138 \\ 
## R$^{2}$ & 0.024 & 0.028 & 0.122 \\ 
## Adjusted R$^{2}$ & $-$0.022 & $-$0.017 & 0.082 \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{3}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
fits2 <- lapply(mods[dv_vars[4:6]], `[[`, "fit")
ses2  <- lapply(mods[dv_vars[4:6]], `[[`, "se")

stargazer(
  fits2,
  type = "latex",
  se = ses2,
  title = "Regression Results (Cluster-robust SEs)",
  column.labels = c("Research quality", "Research environment", "Industry"),
  model.numbers = FALSE,
  dep.var.caption = "",
  dep.var.labels.include = FALSE,
  digits = 3,
  no.space = TRUE,
  omit.stat = c("ser", "f"),
  out = "regression_table_2.tex"
)
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Fri, Mar 20, 2026 - 16:24:39
## \begin{table}[!htbp] \centering 
##   \caption{Regression Results (Cluster-robust SEs)} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & Research quality & Research environment & Industry \\ 
##  topic\_1 & 2.497 & 0.355 & 1.580 \\ 
##   & (2.623) & (4.137) & (2.732) \\ 
##   topic\_3 & 1.038 & $-$0.055 & 1.150 \\ 
##   & (2.340) & (3.171) & (2.074) \\ 
##   topic\_4 & 0.292 & $-$1.356 & 4.535$^{*}$ \\ 
##   & (2.840) & (3.744) & (2.374) \\ 
##   topic\_5 & 1.852 & $-$1.528 & 6.628$^{***}$ \\ 
##   & (2.433) & (3.889) & (2.190) \\ 
##   topic\_6 & 4.764$^{*}$ & 2.125 & 4.043 \\ 
##   & (2.578) & (4.529) & (2.667) \\ 
##   Director & $-$1.863 & $-$2.836 & 2.684 \\ 
##   & (3.525) & (4.772) & (3.687) \\ 
##   Constant & 78.592$^{***}$ & 51.390$^{***}$ & 64.265$^{***}$ \\ 
##   & (3.308) & (4.569) & (3.419) \\ 
##  \hline \\[-1.8ex] 
## Observations & 138 & 138 & 137 \\ 
## R$^{2}$ & 0.056 & 0.019 & 0.079 \\ 
## Adjusted R$^{2}$ & 0.012 & $-$0.025 & 0.036 \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{3}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
time_lag <- 3L # <- Adjust time lag here (robustness check with a three-year lag)

Regression <- Regression %>%
  mutate(
    OutcomeYear = case_when(
      is.na(Year) ~ 2026L,
      tolower(trimws(as.character(Year))) %in% c("no date", "nodate", "no_date") ~ 2026L,
      is.na(suppressWarnings(as.integer(as.character(Year)))) ~ 2026L,
      suppressWarnings(as.integer(as.character(Year))) <= 2008L ~ 2011L,
      suppressWarnings(as.integer(as.character(Year))) >= 2025L ~ 2026L,
      TRUE ~ suppressWarnings(as.integer(as.character(Year))) + time_lag
    )
  )

sum(Regression$Year == 2025, na.rm = TRUE)
## [1] 107
dropped_topic <- "topic_2"  # <- baseline topic to drop, can be modified here
dv_vars <- c(
  "Mean_overall",
  "Teaching",
  "International_outlook",
  "Research_quality",
  "Research_environment",
  "Industry"
)

topic_cols <- grep("^topic_", names(Regression), value = TRUE)
stopifnot(dropped_topic %in% topic_cols)

x_cols_base <- setdiff(topic_cols, dropped_topic)

# Creates Director Dummy
stopifnot("Director" %in% names(Regression))

Regression <- Regression %>%
  mutate(
    Director = suppressWarnings(as.numeric(as.character(Director))),
    Director = if_else(is.na(Director), 0, if_else(Director > 0, 1, 0))
  )

# add Director to predictors
x_cols_base <- c(x_cols_base, "Director")

# Re-defines the model function (unchanged from above)
run_model <- function(dv, x_cols_base, data = Regression) {

  df_reg <- data %>%
    transmute(
      Name,
      y = suppressWarnings(as.numeric(as.character(.data[[dv]]))),
      across(all_of(x_cols_base), ~ suppressWarnings(as.numeric(as.character(.x))))
    ) %>%
    filter(is.finite(y)) %>%
    filter(if_all(all_of(x_cols_base), is.finite))

  # drop zero-variance predictors within THIS sample
  zero_var <- sapply(df_reg[x_cols_base], function(z) sd(z, na.rm = TRUE) == 0)
  zero_var[is.na(zero_var)] <- FALSE
  x_use <- x_cols_base[!zero_var]

  if (length(x_use) == 0) stop("No predictors left after zero-variance drop for DV: ", dv)

  # scale topic shares only (keep Director as 0/1 for interpretation)
  topic_like <- intersect(x_use, grep("^topic_", x_use, value = TRUE))
  df_reg <- df_reg %>%
    mutate(across(all_of(topic_like), ~ as.numeric(scale(.x))))

  # fit OLS
  fml <- as.formula(paste("y ~", paste(x_use, collapse = " + ")))
  fit <- lm(fml, data = df_reg)

  # cluster-robust SEs (HC3) at institution level
  vc <- sandwich::vcovCL(fit, cluster = df_reg$Name, type = "HC3")
  se_cl <- sqrt(diag(vc))
  rob <- lmtest::coeftest(fit, vcov. = vc)

  list(dv = dv, fit = fit, df_reg = df_reg, se = se_cl, robust = rob, x_use = x_use)
}

# Runs all models
mods <- lapply(dv_vars, run_model, x_cols_base = x_cols_base, data = Regression)
names(mods) <- dv_vars

for (dv in dv_vars) {
  fit <- mods[[dv]]$fit
  s   <- summary(fit)

  cat("\n==============================\n")
  cat("DV:", dv, "\n")
  cat("Observations (N):", nobs(fit), "\n")
  cat("Clusters (institutions):", dplyr::n_distinct(mods[[dv]]$df_reg$Name), "\n")
  cat("R2:", round(s$r.squared, 3), "\n")
  cat("Adj. R2:", round(s$adj.r.squared, 3), "\n\n")

  print(mods[[dv]]$robust)
}
## 
## ==============================
## DV: Mean_overall 
## Observations (N): 135 
## Clusters (institutions): 101 
## R2: 0.024 
## Adj. R2: -0.022 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.27201    3.61630 16.9433   <2e-16 ***
## topic_1      1.25487    3.83085  0.3276   0.7438    
## topic_3      0.89174    2.83473  0.3146   0.7536    
## topic_4      0.45814    3.14790  0.1455   0.8845    
## topic_5      0.43267    3.29193  0.1314   0.8956    
## topic_6      3.50034    3.82978  0.9140   0.3624    
## Director    -2.27917    3.75191 -0.6075   0.5446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Teaching 
## Observations (N): 138 
## Clusters (institutions): 104 
## R2: 0.028 
## Adj. R2: -0.017 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.29694    4.17804 12.2778   <2e-16 ***
## topic_1      2.00885    3.71064  0.5414   0.5892    
## topic_3      1.72554    2.66093  0.6485   0.5178    
## topic_4      0.55361    3.17372  0.1744   0.8618    
## topic_5      0.39453    3.35923  0.1174   0.9067    
## topic_6      4.29849    3.86260  1.1128   0.2678    
## Director    -1.41321    4.20113 -0.3364   0.7371    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: International_outlook 
## Observations (N): 138 
## Clusters (institutions): 104 
## R2: 0.122 
## Adj. R2: 0.082 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 66.46051    2.53070 26.2617  < 2e-16 ***
## topic_1     -0.38881    2.35378 -0.1652  0.86905    
## topic_3     -4.52623    2.06564 -2.1912  0.03021 *  
## topic_4     -4.21546    2.18100 -1.9328  0.05542 .  
## topic_5     -4.43608    2.15025 -2.0631  0.04108 *  
## topic_6     -6.13640    2.42259 -2.5330  0.01249 *  
## Director     0.22093    2.86800  0.0770  0.93871    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Research_quality 
## Observations (N): 138 
## Clusters (institutions): 104 
## R2: 0.056 
## Adj. R2: 0.012 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 78.59197    3.30845 23.7550   <2e-16 ***
## topic_1      2.49727    2.62327  0.9520   0.3429    
## topic_3      1.03804    2.34010  0.4436   0.6581    
## topic_4      0.29243    2.83951  0.1030   0.9181    
## topic_5      1.85163    2.43284  0.7611   0.4480    
## topic_6      4.76351    2.57802  1.8477   0.0669 .  
## Director    -1.86340    3.52505 -0.5286   0.5980    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Research_environment 
## Observations (N): 138 
## Clusters (institutions): 104 
## R2: 0.019 
## Adj. R2: -0.025 
## 
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51.390406   4.568642 11.2485   <2e-16 ***
## topic_1      0.354732   4.137492  0.0857   0.9318    
## topic_3     -0.054804   3.170964 -0.0173   0.9862    
## topic_4     -1.356168   3.744339 -0.3622   0.7178    
## topic_5     -1.527739   3.889258 -0.3928   0.6951    
## topic_6      2.124749   4.528675  0.4692   0.6397    
## Director    -2.836162   4.772302 -0.5943   0.5533    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Industry 
## Observations (N): 137 
## Clusters (institutions): 104 
## R2: 0.079 
## Adj. R2: 0.036 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  64.2650     3.4191 18.7958 < 2.2e-16 ***
## topic_1       1.5804     2.7315  0.5786  0.563877    
## topic_3       1.1504     2.0741  0.5547  0.580085    
## topic_4       4.5348     2.3744  1.9099  0.058352 .  
## topic_5       6.6282     2.1897  3.0271  0.002978 ** 
## topic_6       4.0435     2.6665  1.5164  0.131855    
## Director      2.6835     3.6874  0.7277  0.468081    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Exports LaTeX-readable results with cluster-robust SEs
fits1 <- lapply(mods[dv_vars[1:3]], `[[`, "fit")
ses1  <- lapply(mods[dv_vars[1:3]], `[[`, "se")

stargazer(
  fits1,
  type = "latex",
  se = ses1,
  title = "Regression Results (Cluster-robust SEs)",
  column.labels = c("Mean overall", "Teaching", "International outlook"),
  model.numbers = FALSE,
  dep.var.caption = "",
  dep.var.labels.include = FALSE,
  digits = 3,
  no.space = TRUE,
  omit.stat = c("ser", "f"),  
  out = "regression_table_1.tex"
)
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Fri, Mar 20, 2026 - 16:24:39
## \begin{table}[!htbp] \centering 
##   \caption{Regression Results (Cluster-robust SEs)} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & Mean overall & Teaching & International outlook \\ 
##  topic\_1 & 1.255 & 2.009 & $-$0.389 \\ 
##   & (3.831) & (3.711) & (2.354) \\ 
##   topic\_3 & 0.892 & 1.726 & $-$4.526$^{**}$ \\ 
##   & (2.835) & (2.661) & (2.066) \\ 
##   topic\_4 & 0.458 & 0.554 & $-$4.215$^{*}$ \\ 
##   & (3.148) & (3.174) & (2.181) \\ 
##   topic\_5 & 0.433 & 0.395 & $-$4.436$^{**}$ \\ 
##   & (3.292) & (3.359) & (2.150) \\ 
##   topic\_6 & 3.500 & 4.298 & $-$6.136$^{**}$ \\ 
##   & (3.830) & (3.863) & (2.423) \\ 
##   Director & $-$2.279 & $-$1.413 & 0.221 \\ 
##   & (3.752) & (4.201) & (2.868) \\ 
##   Constant & 61.272$^{***}$ & 51.297$^{***}$ & 66.461$^{***}$ \\ 
##   & (3.616) & (4.178) & (2.531) \\ 
##  \hline \\[-1.8ex] 
## Observations & 135 & 138 & 138 \\ 
## R$^{2}$ & 0.024 & 0.028 & 0.122 \\ 
## Adjusted R$^{2}$ & $-$0.022 & $-$0.017 & 0.082 \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{3}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
fits2 <- lapply(mods[dv_vars[4:6]], `[[`, "fit")
ses2  <- lapply(mods[dv_vars[4:6]], `[[`, "se")

stargazer(
  fits2,
  type = "latex",
  se = ses2,
  title = "Regression Results (Cluster-robust SEs)",
  column.labels = c("Research quality", "Research environment", "Industry"),
  model.numbers = FALSE,
  dep.var.caption = "",
  dep.var.labels.include = FALSE,
  digits = 3,
  no.space = TRUE,
  omit.stat = c("ser", "f"),  
  out = "regression_table_2.tex"
)
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Fri, Mar 20, 2026 - 16:24:40
## \begin{table}[!htbp] \centering 
##   \caption{Regression Results (Cluster-robust SEs)} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & Research quality & Research environment & Industry \\ 
##  topic\_1 & 2.497 & 0.355 & 1.580 \\ 
##   & (2.623) & (4.137) & (2.732) \\ 
##   topic\_3 & 1.038 & $-$0.055 & 1.150 \\ 
##   & (2.340) & (3.171) & (2.074) \\ 
##   topic\_4 & 0.292 & $-$1.356 & 4.535$^{*}$ \\ 
##   & (2.840) & (3.744) & (2.374) \\ 
##   topic\_5 & 1.852 & $-$1.528 & 6.628$^{***}$ \\ 
##   & (2.433) & (3.889) & (2.190) \\ 
##   topic\_6 & 4.764$^{*}$ & 2.125 & 4.043 \\ 
##   & (2.578) & (4.529) & (2.667) \\ 
##   Director & $-$1.863 & $-$2.836 & 2.684 \\ 
##   & (3.525) & (4.772) & (3.687) \\ 
##   Constant & 78.592$^{***}$ & 51.390$^{***}$ & 64.265$^{***}$ \\ 
##   & (3.308) & (4.569) & (3.419) \\ 
##  \hline \\[-1.8ex] 
## Observations & 138 & 138 & 137 \\ 
## R$^{2}$ & 0.056 & 0.019 & 0.079 \\ 
## Adjusted R$^{2}$ & 0.012 & $-$0.025 & 0.036 \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{3}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
library(dplyr)
library(lmtest)
library(sandwich)
library(car)
library(stargazer)

# How many observations fall within Trump's terms in office (exploratory, kept commented out)
# Obs_Trump_period <- Regression %>%
#   filter(!is.na(Mean_overall)) %>%
#   filter(Country == "United States") %>%
#   filter((Year >= 2017L & Year <= 2021L) | Year >= 2025L)
#
# nrow(Obs_Trump_period) / 138

dropped_topic <- "topic_6"  # <- baseline topic to drop, can be modified here
dv_vars <- c(
  "Mean_overall",
  "Teaching",
  "International_outlook",
  "Research_quality",
  "Research_environment",
  "Industry"
)

topic_cols <- grep("^topic_", names(Regression), value = TRUE)
stopifnot(dropped_topic %in% topic_cols)

x_cols_base <- setdiff(topic_cols, dropped_topic)

# Builds Director dummy
stopifnot("Director" %in% names(Regression))

Regression <- Regression %>%
  mutate(
    Director = suppressWarnings(as.numeric(as.character(Director))),
    Director = if_else(is.na(Director), 0, if_else(Director > 0, 1, 0))
  )
x_cols_base <- c(x_cols_base, "Director")

# Re-defines the model function (unchanged from above)
run_model <- function(dv, x_cols_base, data = Regression) {

  df_reg <- data %>%
    transmute(
      Name,
      y = suppressWarnings(as.numeric(as.character(.data[[dv]]))),
      across(all_of(x_cols_base), ~ suppressWarnings(as.numeric(as.character(.x))))
    ) %>%
    filter(is.finite(y)) %>%
    filter(if_all(all_of(x_cols_base), is.finite))

  # drop zero-variance predictors within THIS sample
  zero_var <- sapply(df_reg[x_cols_base], function(z) sd(z, na.rm = TRUE) == 0)
  zero_var[is.na(zero_var)] <- FALSE
  x_use <- x_cols_base[!zero_var]

  if (length(x_use) == 0) stop("No predictors left after zero-variance drop for DV: ", dv)

  # scale topic shares only (keep Director as 0/1 for interpretation)
  topic_like <- intersect(x_use, grep("^topic_", x_use, value = TRUE))
  df_reg <- df_reg %>%
    mutate(across(all_of(topic_like), ~ as.numeric(scale(.x))))

  # fit OLS
  fml <- as.formula(paste("y ~", paste(x_use, collapse = " + ")))
  fit <- lm(fml, data = df_reg)

  # cluster-robust SEs (HC3) at institution level
  vc <- sandwich::vcovCL(fit, cluster = df_reg$Name, type = "HC3")
  se_cl <- sqrt(diag(vc))
  rob <- lmtest::coeftest(fit, vcov. = vc)

  list(dv = dv, fit = fit, df_reg = df_reg, se = se_cl, robust = rob, x_use = x_use)
}

Regression_non_US <- Regression %>%
  filter(Country != "United States")
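# Sanity check: the subsample should contain no US institutions
# any(Regression_non_US$Country == "United States")  # FALSE by construction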

# Runs models on the non-US subsample
mods <- lapply(dv_vars, run_model, x_cols_base = x_cols_base, data = Regression_non_US)
names(mods) <- dv_vars

for (dv in dv_vars) {
  fit <- mods[[dv]]$fit
  s   <- summary(fit)

  cat("\n==============================\n")
  cat("DV:", dv, "\n")
  cat("Observations (N):", nobs(fit), "\n")
  cat("Clusters (institutions):", dplyr::n_distinct(mods[[dv]]$df_reg$Name), "\n")
  cat("R2:", round(s$r.squared, 3), "\n")
  cat("Adj. R2:", round(s$adj.r.squared, 3), "\n\n")

  print(mods[[dv]]$robust)
}
## 
## ==============================
## DV: Mean_overall 
## Observations (N): 53 
## Clusters (institutions): 45 
## R2: 0.024 
## Adj. R2: -0.103 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 58.11106    8.37586  6.9379 1.129e-08 ***
## topic_1     -0.78179    5.65609 -0.1382    0.8907    
## topic_2     -1.68759    7.15575 -0.2358    0.8146    
## topic_3      1.12144    6.30049  0.1780    0.8595    
## topic_4     -2.40204    5.27763 -0.4551    0.6512    
## topic_5     -2.47679    3.63222 -0.6819    0.4987    
## Director    -3.77079    8.66612 -0.4351    0.6655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Teaching 
## Observations (N): 56 
## Clusters (institutions): 48 
## R2: 0.039 
## Adj. R2: -0.079 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 46.45321    8.47943  5.4783 1.473e-06 ***
## topic_1      0.40631    5.25822  0.0773    0.9387    
## topic_2     -0.19091    7.19356 -0.0265    0.9789    
## topic_3      2.16845    5.77331  0.3756    0.7088    
## topic_4     -2.54195    5.14744 -0.4938    0.6236    
## topic_5     -2.44541    3.95013 -0.6191    0.5387    
## Director    -2.69389    8.42071 -0.3199    0.7504    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: International_outlook 
## Observations (N): 56 
## Clusters (institutions): 48 
## R2: 0.16 
## Adj. R2: 0.057 
## 
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 73.252698   5.417022 13.5227   <2e-16 ***
## topic_1      5.142589   4.574779  1.1241   0.2664    
## topic_2     -0.024089   5.844057 -0.0041   0.9967    
## topic_3      0.428354   3.662480  0.1170   0.9074    
## topic_4     -2.563731   5.130939 -0.4997   0.6196    
## topic_5     -2.252051   3.394640 -0.6634   0.5102    
## Director     3.370803   5.622855  0.5995   0.5516    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Research_quality 
## Observations (N): 56 
## Clusters (institutions): 48 
## R2: 0.057 
## Adj. R2: -0.059 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 73.83944    9.09914  8.1150 1.275e-10 ***
## topic_1      0.18394    5.22922  0.0352    0.9721    
## topic_2     -2.77542    5.65293 -0.4910    0.6256    
## topic_3     -0.30640    6.80662 -0.0450    0.9643    
## topic_4     -4.47725    5.87399 -0.7622    0.4496    
## topic_5     -2.61257    3.23776 -0.8069    0.4236    
## Director    -3.40857    9.55896 -0.3566    0.7229    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Research_environment 
## Observations (N): 56 
## Clusters (institutions): 48 
## R2: 0.033 
## Adj. R2: -0.085 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 48.51787   10.04517  4.8300 1.382e-05 ***
## topic_1     -0.19094    6.09092 -0.0313    0.9751    
## topic_2     -0.49187    8.36663 -0.0588    0.9534    
## topic_3      0.86053    6.90896  0.1246    0.9014    
## topic_4     -3.61698    5.95288 -0.6076    0.5463    
## topic_5     -3.20470    4.45123 -0.7200    0.4750    
## Director    -2.52780    9.94316 -0.2542    0.8004    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## ==============================
## DV: Industry 
## Observations (N): 55 
## Clusters (institutions): 48 
## R2: 0.162 
## Adj. R2: 0.057 
## 
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 63.62789    6.04655 10.5230 4.648e-14 ***
## topic_1     -4.32212    5.19696 -0.8317    0.4097    
## topic_2     -3.94246    6.14690 -0.6414    0.5243    
## topic_3      3.34962    3.47302  0.9645    0.3396    
## topic_4      0.36944    6.04926  0.0611    0.9516    
## topic_5      5.47506    5.98413  0.9149    0.3648    
## Director    -1.43240    7.16399 -0.1999    0.8424    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Exports LaTeX tables with cluster-robust SEs
fits1 <- lapply(mods[dv_vars[1:3]], `[[`, "fit")
ses1  <- lapply(mods[dv_vars[1:3]], `[[`, "se")

stargazer(
  fits1,
  type = "latex",
  se = ses1,
  title = "Regression Results (Cluster-robust SEs)",
  column.labels = c("Mean overall", "Teaching", "International outlook"),
  model.numbers = FALSE,
  dep.var.caption = "",
  dep.var.labels.include = FALSE,
  digits = 3,
  no.space = TRUE,
  omit.stat = c("ser", "f"),  
  out = "regression_table_1.tex"
)
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Fri, Mar 20, 2026 - 16:24:40
## \begin{table}[!htbp] \centering 
##   \caption{Regression Results (Cluster-robust SEs)} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & Mean overall & Teaching & International outlook \\ 
##  topic\_1 & $-$0.782 & 0.406 & 5.143 \\ 
##   & (5.656) & (5.258) & (4.575) \\ 
##   topic\_2 & $-$1.688 & $-$0.191 & $-$0.024 \\ 
##   & (7.156) & (7.194) & (5.844) \\ 
##   topic\_3 & 1.121 & 2.168 & 0.428 \\ 
##   & (6.300) & (5.773) & (3.662) \\ 
##   topic\_4 & $-$2.402 & $-$2.542 & $-$2.564 \\ 
##   & (5.278) & (5.147) & (5.131) \\ 
##   topic\_5 & $-$2.477 & $-$2.445 & $-$2.252 \\ 
##   & (3.632) & (3.950) & (3.395) \\ 
##   Director & $-$3.771 & $-$2.694 & 3.371 \\ 
##   & (8.666) & (8.421) & (5.623) \\ 
##   Constant & 58.111$^{***}$ & 46.453$^{***}$ & 73.253$^{***}$ \\ 
##   & (8.376) & (8.479) & (5.417) \\ 
##  \hline \\[-1.8ex] 
## Observations & 53 & 56 & 56 \\ 
## R$^{2}$ & 0.024 & 0.039 & 0.160 \\ 
## Adjusted R$^{2}$ & $-$0.103 & $-$0.079 & 0.057 \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{3}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
fits2 <- lapply(mods[dv_vars[4:6]], `[[`, "fit")
ses2  <- lapply(mods[dv_vars[4:6]], `[[`, "se")

stargazer(
  fits2,
  type = "latex",
  se = ses2,
  title = "Regression Results (Cluster-robust SEs)",
  column.labels = c("Research quality", "Research environment", "Industry"),
  model.numbers = FALSE,
  dep.var.caption = "",
  dep.var.labels.include = FALSE,
  digits = 3,
  no.space = TRUE,
  omit.stat = c("ser", "f"),  
  out = "regression_table_2.tex"
)
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: Fri, Mar 20, 2026 - 16:24:40
## \begin{table}[!htbp] \centering 
##   \caption{Regression Results (Cluster-robust SEs)} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & Research quality & Research environment & Industry \\ 
##  topic\_1 & 0.184 & $-$0.191 & $-$4.322 \\ 
##   & (5.229) & (6.091) & (5.197) \\ 
##   topic\_2 & $-$2.775 & $-$0.492 & $-$3.942 \\ 
##   & (5.653) & (8.367) & (6.147) \\ 
##   topic\_3 & $-$0.306 & 0.861 & 3.350 \\ 
##   & (6.807) & (6.909) & (3.473) \\ 
##   topic\_4 & $-$4.477 & $-$3.617 & 0.369 \\ 
##   & (5.874) & (5.953) & (6.049) \\ 
##   topic\_5 & $-$2.613 & $-$3.205 & 5.475 \\ 
##   & (3.238) & (4.451) & (5.984) \\ 
##   Director & $-$3.409 & $-$2.528 & $-$1.432 \\ 
##   & (9.559) & (9.943) & (7.164) \\ 
##   Constant & 73.839$^{***}$ & 48.518$^{***}$ & 63.628$^{***}$ \\ 
##   & (9.099) & (10.045) & (6.047) \\ 
##  \hline \\[-1.8ex] 
## Observations & 56 & 56 & 55 \\ 
## R$^{2}$ & 0.057 & 0.033 & 0.162 \\ 
## Adjusted R$^{2}$ & $-$0.059 & $-$0.085 & 0.057 \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{3}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
Regression <- Regression %>%
  mutate(
    International_outlook_num = International_outlook %>%
      as.character() %>%
      str_replace(",", ".") %>%     # convert decimal comma to dot
      str_replace_all("[^0-9\\.\\-]", "") %>%  # drop stray chars like % or spaces
      as.numeric()
  )
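# Illustrative behavior of the cleaning on hypothetical inputs:
# suppressWarnings(as.numeric(str_replace_all(
#   str_replace(c("75,4", "60.5 %", "-"), ",", "."), "[^0-9\\.\\-]", "")))
# # expected: 75.4 60.5 NA (a lone "-" cannot be coerced)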

Regression %>%
  mutate(Group = if_else(Country == "United States", "US", "Non-US")) %>%
  group_by(Group) %>%
  summarise(
    mean_international_outlook = mean(International_outlook_num, na.rm = TRUE),
    n = sum(!is.na(International_outlook_num)),
    .groups = "drop"
  )
## # A tibble: 2 × 3
##   Group  mean_international_outlook     n
##   <chr>                       <dbl> <int>
## 1 Non-US                       75.4    56
## 2 US                           60.5    82
topic_cols <- grep("^topic_", names(Regression), value = TRUE)

avg_topics_obs <- Regression %>%
  mutate(Group = if_else(Country == "United States", "US", "Non-US")) %>%
  group_by(Group) %>%
  summarise(across(all_of(topic_cols), ~ mean(.x, na.rm = TRUE))) %>%
  pivot_longer(-Group, names_to = "topic", values_to = "mean_share")

avg_topics_obs
## # A tibble: 12 × 3
##    Group  topic   mean_share
##    <chr>  <chr>        <dbl>
##  1 Non-US topic_1     0.175 
##  2 Non-US topic_2     0.318 
##  3 Non-US topic_3     0.0922
##  4 Non-US topic_4     0.179 
##  5 Non-US topic_5     0.128 
##  6 Non-US topic_6     0.109 
##  7 US     topic_1     0.155 
##  8 US     topic_2     0.0477
##  9 US     topic_3     0.140 
## 10 US     topic_4     0.120 
## 11 US     topic_5     0.189 
## 12 US     topic_6     0.348
library(ggplot2)
library(stringr)
library(tidyr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following object is masked from 'package:purrr':
## 
##     discard
avg_order <- avg_topics_obs %>%
  pivot_wider(names_from = Group, values_from = mean_share) %>%
  mutate(abs_diff = abs(US - `Non-US`)) %>%
  arrange(desc(abs_diff)) %>%
  pull(topic)

avg_topics_obs %>%
  mutate(topic = factor(topic, levels = avg_order)) %>%
  ggplot(aes(x = topic, y = mean_share, fill = Group)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  labs(x = NULL, y = "Average topic share", fill = NULL) +
  theme_minimal(base_size = 25) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.title = element_blank(),  # remove legend title
    legend.text  = element_text(size = 30, lineheight = 1.4),  # more spacing between names
    legend.key.size = grid::unit(1.2, "cm"),
    legend.spacing.y = grid::unit(1.25, "cm")                  # more vertical gap between items
  )
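# ggsave() with no explicit plot argument saves the most recently displayed plot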

ggsave(
  filename = "Topic_Distribution_US_Non_US.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)
  

if(!"Continent" %in% names(Regression)) {
  library(countrycode)
  Regression <- Regression %>%
    mutate(Continent = countrycode(Country, origin = "country.name", destination = "continent"))
}
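# countrycode maps free-form country names to continents, e.g.:
# countrycode(c("Germany", "United States"), "country.name", "continent")
# # expected: "Europe" "Americas"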

# Average topic shares by continent (observation-weighted)
avg_topics_cont <- Regression %>%
  filter(!is.na(Continent)) %>%
  group_by(Continent) %>%
  summarise(across(all_of(topic_cols), ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>%
  pivot_longer(-Continent, names_to = "topic", values_to = "mean_share")

ggplot(avg_topics_cont, aes(x = Continent, y = mean_share, fill = topic)) +
  geom_col(position = "fill", width = 0.8) +
  # scale_y_continuous(labels = percent_format(accuracy = 1)) +  # optional percent axis
  labs(x = NULL, y = "Average topic share", fill = "Topic") +
  theme_minimal(base_size = 25) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    legend.title = element_blank(),                             # remove legend title
    legend.text  = element_text(size = 30, lineheight = 1.4),   # more spacing between names
    legend.key.size = grid::unit(1.2, "cm"),
    legend.spacing.y = grid::unit(1.25, "cm")                   # more vertical gap between items
  )

ggsave(
  filename = "Stacked_Barplot_Topic_Distribution_Continent.jpg",
  path = "~/Desktop/Studium/Master/Master Thesis/Data Analysis/Plots", # For reproduction, specify your path
  width = 10, height = 6, dpi = 300
)