Scrape all text in <p> tags under a heading with Python

You can use a Python library such as BeautifulSoup to scrape all the text within <p> tags that are located under a specific heading. Here’s an example of how you could use BeautifulSoup to accomplish this:

from bs4 import BeautifulSoup
import requests

# Make a request to the website
response = requests.get('http://yourwebsite.com')

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the heading element
heading = soup.find("h2", string="Your heading text here")

# Find all <p> tags that follow the heading as siblings
paragraphs = heading.find_next_siblings("p")

# Extract the text from each p tag
for p in paragraphs:
    print(p.text)
This code prints the text of all the <p> tags that come after the heading “Your heading text here”. You can customize it to your requirements. Please note that this snippet is a basic example, and you might need to adjust it for your specific use case.
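Note that find_next_siblings("p") collects every later sibling <p>, including paragraphs that belong to later sections of the page. If you only want the paragraphs between this heading and the next heading of the same level, you can walk the siblings and stop at the next <h2>. A minimal sketch, assuming the same placeholder URL and heading text as above:

from bs4 import BeautifulSoup
import requests

response = requests.get('http://yourwebsite.com')  # placeholder URL
soup = BeautifulSoup(response.content, 'html.parser')

# locate the heading whose text we care about
heading = soup.find("h2", string="Your heading text here")

section_paragraphs = []
for sibling in heading.find_next_siblings():
    if sibling.name == "h2":        # stop at the next section heading
        break
    if sibling.name == "p":         # keep only paragraph text
        section_paragraphs.append(sibling.get_text())

print("\n".join(section_paragraphs))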

Scrape all text in <p> tags under a heading with Python (Wikipedia plot example)

from bs4 import BeautifulSoup as bs
from requests import get

url = "https://en.wikipedia.org/wiki/Harry_Potter_and_the_Half-Blood_Prince"
response = get(url)
soup = bs(response.text, 'html.parser')

# collect the plot text in this list
plot = []

# take the second <h2> on the page and find the <span> inside it that carries an id
mark1 = soup.find_all("h2")[1]
mark = mark1.find(lambda tag: tag.name == 'span' and tag.get('id'))

# walk through the siblings of the parent (h2) node
# until we reach the next h2 node
for elt in mark.parent.next_siblings:
    if elt.name == "h2":
        break
    if hasattr(elt, "text"):
        plot.append(elt.text)

joinedPall = "".join(plot)
print(joinedPall)


Scrape all <p> tags containing bold text under an h2 (found by id or class) of a webpage

from bs4 import BeautifulSoup as bs
from requests import get

url = "https://wikitravel.org/en/Bhopal"
response = get(url)
soup = bs(response.text, 'html.parser')

# collect the matching text in this list
plot = []

# find the element whose id is "See" (the span inside the "See" section heading)
mark = soup.find(id="See")

# walk through the siblings of the parent (h2) node
# until we reach the next h2 node
for elt in mark.parent.next_siblings:
    if elt.name == "h2":
        break
    # keep only tags that contain a <b> element
    if elt.name is not None and elt.find('b'):
        plot.append(elt.text)

print(*plot, sep='\n')  # print the list in a readable way
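If you only need the bold text itself (for example, the listing names) rather than the whole paragraphs, you can pull the <b> tags out of each sibling instead. A minimal sketch under the same assumption that the "See" section heading carries the id "See":

from bs4 import BeautifulSoup as bs
from requests import get

url = "https://wikitravel.org/en/Bhopal"
soup = bs(get(url).text, 'html.parser')

# the span inside the "See" heading
mark = soup.find(id="See")

names = []
for elt in mark.parent.next_siblings:
    if elt.name == "h2":            # stop at the next section
        break
    if elt.name is not None:
        # collect the text of every <b> tag inside this sibling
        names.extend(b.get_text(strip=True) for b in elt.find_all('b'))

print(*names, sep='\n')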


Scrape all <p> tags between two headings on a Wikipedia page

from bs4 import BeautifulSoup as bs
from requests import get

url = "https://en.wikipedia.org/wiki/Harry_Potter_and_the_Half-Blood_Prince"
response = get(url)
soup = bs(response.text, 'html.parser')

# collect the plot text in this list
plot = []

# find the node with id of "Plot" (the span inside the "Plot" heading)
mark = soup.find(id="Plot")

# walk through the siblings of the parent (h2) node
# until we reach the next h2 node
for elt in mark.parent.next_siblings:
    if elt.name == "h2":
        break
    if hasattr(elt, "text"):
        plot.append(elt.text)

print("".join(plot))

Scrape all h2 and all p tags on a webpage

from bs4 import BeautifulSoup as bs
from requests import get

# read the page
url = 'https://feniinformation.blogspot.com/2022/04/dddd.html'
response = get(url)
html = bs(response.text, 'html.parser')

# find the node with class of "WordSection1"
mark = html.find(class_="WordSection1")

# initialize container lists
headers = []
content = []

# get all h2 tags and all p tags with class MsoNormal that follow the mark
find_headers = mark.find_all_next('h2')
find_content = mark.find_all_next('p', {'class': 'MsoNormal'})

# get the text of the tags (zip pairs them positionally,
# so this assumes one MsoNormal paragraph per h2)
for head, cont in zip(find_headers, find_content):
    headers.append(head.text)
    content.append(cont.text)

# output the lists
print(f"Headers = {headers}\n\nContent = {content}")

print(headers[1])
print(content[1])

print(headers[2])
print(content[2])
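zip() pairs the n-th h2 with the n-th paragraph, which only lines up when every heading is followed by exactly one MsoNormal paragraph. If a heading can be followed by several paragraphs, grouping paragraphs per heading while walking the container is safer. A minimal sketch under the same assumptions about the page structure:

from bs4 import BeautifulSoup as bs
from requests import get

url = 'https://feniinformation.blogspot.com/2022/04/dddd.html'
html = bs(get(url).text, 'html.parser')
mark = html.find(class_="WordSection1")

sections = {}   # heading text -> list of paragraph texts
current = None
for elt in mark.descendants:
    if elt.name == "h2":
        current = elt.get_text(strip=True)
        sections[current] = []
    elif elt.name == "p" and "MsoNormal" in (elt.get("class") or []) and current:
        sections[current].append(elt.get_text())

for head, paras in sections.items():
    print(head)
    print("\n".join(paras))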

Best code:

Single section:

from bs4 import BeautifulSoup as bs
from requests import get

# read the page
url = 'https://www.gobankingrates.com/reviews/security-service-fcu/'
response = get(url)
soup = bs(response.text, 'html.parser')

# all h2 headings inside the article container
allh2soup = soup.find('div', {'class': 'row article-container'})
allh2 = allh2soup.find_all("h2")

# print the available section indexes, one per line
for i in range(len(allh2)):
    print(i)

# containers for the different tag types in the chosen section
h3plot1 = []
plot1 = []
ulplot1 = []
olplot1 = []
blockquoteplot1 = []

# pick one section by index (here the fifth h2 in the article container)
mark1 = allh2[4]
print(mark1.text)

# walk the siblings of the chosen h2 until the next h2
for elt in mark1.next_siblings:
    if elt.name == "h2":
        break
    if elt.name == "h3":
        h3plot1.append(elt.text)
    if elt.name == "p":
        plot1.append(elt.text)
    if elt.name == "ul":
        ulplot1.append(elt.text)
    if elt.name == "ol":
        olplot1.append(elt.text)
    if elt.name == "blockquote":
        blockquoteplot1.append(elt.text)

print(h3plot1)

joinedplot1 = "".join(plot1)
print(joinedplot1)

joinedulolplot1 = "".join(ulplot1 + olplot1)
print(joinedulolplot1)

joinedblockquoteplot1 = "".join(blockquoteplot1)
print(joinedblockquoteplot1)

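Before parsing, it can also be worth confirming that the request actually succeeded, so you are not parsing an error page. A small sketch using the standard requests API:

import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.gobankingrates.com/reviews/security-service-fcu/'
response = requests.get(url, timeout=10)
response.raise_for_status()   # raises an HTTPError for 4xx/5xx responses
soup = bs(response.text, 'html.parser')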

Bulk (loop over the remaining sections):

from bs4 import BeautifulSoup as bs
from requests import get

# read the page
url = 'https://www.gobankingrates.com/reviews/security-service-fcu/'
response = get(url)
soup = bs(response.text, 'html.parser')

# all h2 headings inside the article container
allh2soup = soup.find('div', {'class': 'row article-container'})
allh2 = allh2soup.find_all("h2")
print(list(range(len(allh2))))

# start from the twelfth h2 (index 11) and process every section after it
i = 11
while i < len(allh2):

    h3plot1 = []
    plot1 = []
    ulplot1 = []
    olplot1 = []
    blockquoteplot1 = []

    mark1 = allh2[i]
    print(mark1.text)

    # walk the siblings of this h2 until the next h2
    for elt in mark1.next_siblings:
        if elt.name == "h2":
            break
        if elt.name == "h3":
            h3plot1.append(elt.text)
        if elt.name == "p":
            plot1.append(elt.text)
        if elt.name == "ul":
            ulplot1.append(elt.text)
        if elt.name == "ol":
            olplot1.append(elt.text)
        if elt.name == "blockquote":
            blockquoteplot1.append(elt.text)

    print(h3plot1)

    joinedplot1 = "".join(plot1)
    print(joinedplot1)

    joinedulolplot1 = "".join(ulplot1 + olplot1)
    print(joinedulolplot1)

    joinedblockquoteplot1 = "".join(blockquoteplot1)
    print(joinedblockquoteplot1)

    i = i + 1
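The single-section and bulk versions repeat the same sibling walk, so the whole thing can also be wrapped in one function that returns the collected text per section. A minimal sketch under the same assumptions about the page layout (the 'row article-container' div and the tag types collected above); scrape_sections is a hypothetical name:

from bs4 import BeautifulSoup as bs
from requests import get

def scrape_sections(url, start=0):
    # hypothetical helper: one dict per h2 section inside the article container,
    # holding the h3/p/ul/ol/blockquote text that follows each heading
    soup = bs(get(url).text, 'html.parser')
    container = soup.find('div', {'class': 'row article-container'})
    sections = []
    for h2 in container.find_all("h2")[start:]:
        data = {"heading": h2.get_text(strip=True),
                "h3": [], "p": [], "ul": [], "ol": [], "blockquote": []}
        for elt in h2.next_siblings:
            if elt.name == "h2":
                break
            if elt.name in ("h3", "p", "ul", "ol", "blockquote"):
                data[elt.name].append(elt.get_text())
        sections.append(data)
    return sections

for section in scrape_sections('https://www.gobankingrates.com/reviews/security-service-fcu/', start=11):
    print(section["heading"])
    print("".join(section["p"]))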

