Exporting LiveJournal pages

Jun 23, 2016 16:43

If you need to save the history of your posts with all the pictures and comments, either as a keepsake or in case good old LiveJournal shuts down or your login gets hijacked, here is a how-to for doing it with the R language (https://www.r-project.org/).

The code is a bit rough, but these are my first steps in R.

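Before the first run you need the two packages the function uses and an img folder in the working directory, where all the pictures will be downloaded. A minimal setup sketch:

# one-time setup before calling myExportLJ()
install.packages(c("XML", "data.table"))
dir.create("img", showWarnings = FALSE)   # the pictures are saved into this folder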

myExportLJ <- function() {
###########  R code to export LiveJournal blog posts together with images to a local drive  ###########
###########  The code also creates an index file listing all exported pages                 ###########
###########  Create an "img" folder in the working directory before running                 ###########
library(XML)          ## install.packages("XML")
library(data.table)   ## install.packages("data.table")
# library(jpeg)
#local constants
######################################
LJ_blog <- "http://papasonik.livejournal.com/"
LJ_start_page <- "17466.html"
LJ_number_of_posts_to_retrieve<-540
######################################
LJ_current_file<-LJ_start_page
# template for the output index file (a minimal HTML skeleton; the loop appends one entry per exported page)
index_file <- c("<html>", "<head>",
                '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">',
                '<title>LiveJournal export</title>',
                "</head>", "<body><h1>Index list</h1>", "<ul>"
)

tryCatch ({
  for (rep_count in 1:LJ_number_of_posts_to_retrieve) {

print(paste("Starting to read file: ", paste(LJ_blog, LJ_current_file, sep="")))

con <- url(paste(LJ_blog, LJ_current_file, sep=""))
    htmlCode <- readLines(con, n = -1L, ok = TRUE, warn = FALSE, skipNul = FALSE)
    close(con)

    # prepare one large text
    oneliner<-paste(htmlCode, sep="", collapse="\n")

    # get all external files referenced by the page
    filelist<-getHTMLExternalFiles(htmlCode, xpQuery = c("//img/@src", "//link/@href",
                                                         "//script/@href", "//embed/@src"),
                                   baseURL = docName(htmlCode), relative = FALSE,
                                   asNodes = FALSE, recursive = FALSE)

    # put the list of all file links into a data.table
    DT <- data.table(filename=filelist)

    # keep only JPG files served over http
    DT_files_url<- DT[filename %like% "http.*jpg"]

    if (nrow(DT_files_url)>0)
    {
      for (i in 1:nrow(DT_files_url))
      {
        # extract file name with folder img added
        file_name <-paste("img/i", gsub("_jpg",".jpg",gsub(":","",gsub("/","_",gsub("\\.","_",gsub("^http","",DT_files_url[i]))))), sep="" )

if (regexpr("?gl",file_name, fixed=TRUE)>1) {
        file_name <- substr(file_name, 1, regexpr("?gl",file_name, fixed=TRUE)-1)
        }

# check if file exists
        if (file.exists(file_name)  )
        { print(paste("WARNING: file exists: ", file_name))
        } else
        {# download file one by one and save into img

if (regexpr("?gl",toString(DT_files_url[i]), fixed=TRUE)>1) {

file_jpg <- substr(toString(DT_files_url[i]), 1, regexpr("?gl",toString(DT_files_url[i]), fixed=TRUE)-1)
          } else { file_jpg <-toString(DT_files_url[i])
          }

if (regexpr("wikipedia",toString(DT_files_url[i]), fixed=TRUE)>1)
          {print(paste("Skipping JPG file: ", file_jpg))}
          else {
          print(paste("Downloading JPG file: ", file_jpg))
          print(paste("Saving JPG file to: ", file_name))
          download.file(file_jpg,destfile=file_name ,mode="wb")

# now - replace all file names with references to local files
          oneliner <- gsub(toString(DT_files_url[i]),file_name,oneliner)
          }
        }
      }
    }
    # extract the name of the next page to load (the previous entry, back in time);
    # the numeric offset skips the <a href=...> markup and the blog URL in front of the file name
    pos <- 26+nchar(LJ_blog)+regexpr('class="prevnext">', oneliner, fixed=TRUE)
    LJ_next_file_to_read <- substr(oneliner,pos,pos+100)
    LJ_next_file_to_read <- substr(LJ_next_file_to_read,1,4+regexpr('.html',LJ_next_file_to_read, fixed=TRUE))

    # replace the absolute links to the blog's own pages (the "Previous Entry | Next Entry"
    # navigation) with relative links, so they point to the locally saved .html files
    oneliner <- gsub(paste('class="prevnext"><a href="', LJ_blog, sep=""),
                     'class="prevnext"><a href="', oneliner, fixed=TRUE)

    # strip <script>...</script> and <svg>...</svg> blocks before saving the page
    htmlCodeNoScripts  <- gsub("<script[^>]*>[^<]*(?:<(?!/script>)[^<]*)*</script>", "", oneliner, perl=T)
    htmlCodeNoScripts1 <- gsub("<svg[^>]*>[^<]*(?:<(?!/svg>)[^<]*)*</svg>", "", htmlCodeNoScripts, perl=T)

    # save the cleaned page under its original name and add it to the index
    writeLines(htmlCodeNoScripts1, LJ_current_file)
    index_file <- c(index_file,
                    paste('<li><a href="', LJ_current_file, '">', LJ_current_file, '</a></li>', sep=""))

    # step back in time to the previous entry
    LJ_current_file <- LJ_next_file_to_read
  }
},
error = function(e) { print(paste("Export stopped with an error: ", conditionMessage(e))) },
finally = {
  # close the HTML list and write the index of all exported pages
  index_file <- c(index_file, "</ul>", "</body>", "</html>")
  writeLines(index_file, "index.html")
  print("Index written to index.html")
})
}
