Age,Female,Male 0~10,5,6 11~20,15,14 21~30,20,22 31~40,25,24 41~50,25,28 51~60,30,26 61~70,20,18 71~80,15,16 81~90,10,12
Select option:
# 載入必要套件 #======================== library(dplyr) library(tidytext) library(stringr) library(udpipe) library(tidyr) library(topicmodels) library(tm) library(readr) # ======================== 2. Load Data Robustly ======================== csv_path <- "F:/RR/methodsxabstract2.csv" ncluster<-10 # in lda # Graceful reading with fallback if (!exists("df") || !"ArticleTitle" %in% colnames(df) || !"Abstract" %in% colnames(df)) { df <- read_csv("F:/RR/methodsxabstract2.csv") } nrow(df) if (!exists("df") || !"ArticleTitle" %in% colnames(df) || !"Abstract" %in% colnames(df)) { df <- read.csv(csv_path, stringsAsFactors = FALSE, fill = TRUE) } nrow(df) if (!all(c("ArticleTitle", "Abstract") %in% colnames(df))) { stop("Your CSV is missing required columns: ArticleTitle and Abstract") } nrow(df) # ======================== 3. Combine and Clean Text ======================== df <- df %>% mutate( CombinedText = paste(ArticleTitle, Abstract, sep = ". "), CombinedText = iconv(CombinedText, from = "", to = "UTF-8", sub = " "), CombinedText = gsub("[^[:print:]]", " ", CombinedText), CombinedText = str_trim(CombinedText) ) %>% filter(!is.na(CombinedText), nchar(CombinedText) > 10) %>% mutate(doc_id = row_number()) # ======================== 4. TF-IDF Keywords Extraction ======================== data("stop_words") df_clean <- df %>% unnest_tokens(word, CombinedText) %>% anti_join(stop_words, by = "word") %>% filter(!str_detect(word, "\\d")) %>% mutate(word = str_to_lower(word)) %>% filter(str_length(word) > 3) tfidf_keywords <- df_clean %>% count(doc_id, word, sort = TRUE) %>% bind_tf_idf(word, doc_id, n) %>% group_by(doc_id) %>% slice_max(tf_idf, n = 7) %>% summarise(Keywords_TFIDF = paste(unique(word), collapse = "; ")) %>% ungroup() df <- df %>% left_join(tfidf_keywords, by = "doc_id") # ======================== 5. UDPipe Model Loading ======================== ud_model_info <- udpipe_download_model(language = "english") ud_model <- udpipe_load_model(ud_model_info$file_model) cat("Taking more time to get the annotations........","\t") # ======================== 6. UDPipe Annotation ======================== annotations <- udpipe_annotate( ud_model, x = df$CombinedText, doc_id = df$doc_id ) anno_df <- as.data.frame(annotations) %>% filter(!is.na(doc_id)) # ======================== 7. RAKE Keywords Extraction ======================== cat("Generating UDPipe-based RAKE keywords...\n") # Clean annotation dataframe: remove NA or empty lemmas anno_df <- anno_df %>% filter(!is.na(lemma), lemma != "") if (nrow(anno_df) > 0) { # Add a column indicating which tokens we'll consider for RAKE anno_df <- anno_df %>% mutate(is_relevant = upos %in% c("NOUN", "ADJ", "PROPN", "VERB")) # Check if there's *any* relevant token if (any(anno_df$is_relevant)) { # Run RAKE extraction on relevant tokens rake_keywords <- keywords_rake( x = anno_df, term = "lemma", group = "doc_id", relevant = anno_df$is_relevant ) # Check if RAKE gave us any results if (!is.null(rake_keywords) && nrow(rake_keywords) > 0 && "doc_id" %in% colnames(rake_keywords)) { # Take top 7 per doc rake_top <- rake_keywords %>% group_by(doc_id) %>% slice_max(rake, n = 7, with_ties = FALSE) %>% summarise(Keywords_RAKE = paste(unique(keyword), collapse = "; ")) %>% ungroup() # Merge with main dataframe df <- df %>% left_join(rake_top, by = "doc_id") %>% mutate(Keywords_RAKE = ifelse(is.na(Keywords_RAKE), "NoKeywordsFound", Keywords_RAKE)) } else { cat("RAKE extraction returned no keywords at all. Assigning fallback.\n") df$Keywords_RAKE <- "NoKeywordsFound" } } else { cat("No relevant POS tags found (NOUN, ADJ, PROPN, VERB). Assigning fallback.\n") df$Keywords_RAKE <- "NoKeywordsFound" } } else { cat("No annotations found at all. Assigning fallback.\n") df$Keywords_RAKE <- "NoKeywordsFound" } # ======================== 8. LDA Topic Modeling ======================== corpus <- Corpus(VectorSource(df$CombinedText)) dtm <- DocumentTermMatrix( corpus, control = list( stopwords = TRUE, removePunctuation = TRUE, removeNumbers = TRUE, wordLengths = c(3, Inf) ) ) lda <- LDA(dtm, k = ncluster, control = list(seed = 1234)) ###This line fits an LDA model to your document–term matrix (dtm) topics <- tidy(lda, matrix = "beta") ###This line converts the LDA model into a tidy (data.frame-like) format using the tidytext package. ###doc_topics <- tidy(lda, matrix = "gamma") ##That gives the probability that each document belongs to each topic (γ values). top_terms <- topics %>% group_by(topic) %>% slice_max(beta, n = 7) %>% summarise(Top_Terms = paste(term, collapse = "; ")) print(top_terms) # Assign dominant topic per document gamma <- tidy(lda, matrix = "gamma") doc_topics <- gamma %>% group_by(document) %>% slice_max(gamma, n = 1) %>% mutate(doc_id = as.integer(document)) %>% select(doc_id, topic) df <- df %>% left_join(doc_topics, by = "doc_id") library(dplyr) library(stringr) library(tidyr) # 1️⃣ Split into separate rows and rename top_terms_long <- top_terms %>% separate_rows(Top_Terms, sep = ";\\s*") %>% rename(term = Top_Terms) %>% group_by(topic) %>% mutate(term_rank = row_number()) %>% ungroup() # 2️⃣ Pick top N terms per topic for label top_terms_for_label <- top_terms_long %>% filter(term_rank <= 2) %>% group_by(topic) %>% summarise( topic_label = str_c(str_to_title(term), collapse = " and "), .groups = "drop" ) # 3️⃣ Join back to original top_terms for reference top_terms_labeled <- top_terms %>% left_join(top_terms_for_label, by = "topic") print(top_terms_labeled) df <- df %>% left_join(top_terms_labeled %>% select(topic, topic_label), by = "topic") # ======================== 9. View Final Result ======================== print( df %>% select(doc_id, ArticleTitle, Keywords_TFIDF, Keywords_RAKE, topic) %>% head(10) ) write.csv(df, "F:/RR/methodsxabstract_with_keywords_topics.csv", row.names = FALSE)